From 548fe51740d0f3294e548f654c099e5aefbf4cb7 Mon Sep 17 00:00:00 2001 From: Madhav Bhatt Date: Thu, 17 Apr 2025 02:45:43 -0700 Subject: firmware: xilinx: Add debugfs support for PM_GET_NODE_STATUS Add a new debug interface to support PM_GET_NODE_STATUS to get node information such as requirements and usage. The debugfs firmware driver interface is only meant for testing and debugging EEMI APIs. Hence, it is disabled by default in production systems. Signed-off-by: Madhav Bhatt Link: https://lore.kernel.org/r/20250417094543.3873507-1-madhav.bhatt@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..4699f50465f2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,7 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx - * Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. + * Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc. * * Michal Simek * Davorin Mista @@ -164,6 +164,7 @@ enum pm_api_cb_id { enum pm_api_id { PM_API_FEATURES = 0, PM_GET_API_VERSION = 1, + PM_GET_NODE_STATUS = 3, PM_REGISTER_NOTIFIER = 5, PM_FORCE_POWERDOWN = 8, PM_REQUEST_WAKEUP = 10, @@ -629,6 +630,8 @@ int zynqmp_pm_request_wake(const u32 node, int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode); int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode); int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode); +int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, u32 *const usage); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -931,6 +934,13 @@ static inline int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mo return -ENODEV; } +static inline int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, + u32 *const usage) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) -- cgit v1.2.3 From e66f4c35e375346943bfe2a0990e97253f74440f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:50 -0700 Subject: drivers: firmware: xilinx: Add unique family code for all platforms The family code is currently derived from the PMC_TAP_IDCODE register value, but there are issues where Versal, Versal NET, and future platforms share the same family code. Additionally, some platforms have identical subfamily codes, making it challenging to differentiate between platforms based on the family and subfamily codes. To resolve this, a new family code member is added to the platform data, initialized with unique values. This change enables better platform distinction via the compatible string.
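In miniature, the approach looks like the sketch below, condensed from the driver diff that follows; the example_* wrapper names are illustrative, while struct platform_fw_data, the match-table data and the device_get_match_data() lookup are taken from the actual patch.

#include <linux/errno.h>
#include <linux/firmware/xlnx-zynqmp.h>
#include <linux/mod_devicetable.h>
#include <linux/platform_device.h>
#include <linux/property.h>

struct platform_fw_data {
	const u32 family_code;	/* one unique PM_*_FAMILY_CODE per platform */
};

static const struct platform_fw_data platform_fw_data_versal = {
	.family_code = PM_VERSAL_FAMILY_CODE,
};

static const struct of_device_id example_of_match[] = {
	{ .compatible = "xlnx,versal-firmware", .data = &platform_fw_data_versal },
	{ /* sentinel */ }
};

static int example_probe(struct platform_device *pdev)
{
	const struct platform_fw_data *fw_data;

	/* The matched compatible string selects the per-platform data. */
	fw_data = device_get_match_data(&pdev->dev);
	if (!fw_data)
		return -EINVAL;

	/* fw_data->family_code now uniquely identifies the platform. */
	return 0;
}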
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-3-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 32 +++++++++++++++++++++++++++++--- include/linux/firmware/xlnx-zynqmp.h | 5 +++++ 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 53b8c88b4178..2d250a16d3dd 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -72,6 +72,15 @@ struct pm_api_feature_data { struct hlist_node hentry; }; +struct platform_fw_data { + /* + * Family code for platform. + */ + const u32 family_code; +}; + +static struct platform_fw_data *active_platform_fw_data; + static const struct mfd_cell firmware_devs[] = { { .name = "zynqmp_power_controller", @@ -2052,6 +2061,11 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) if (ret) return ret; + /* Get platform-specific firmware data from device tree match */ + active_platform_fw_data = (struct platform_fw_data *)device_get_match_data(dev); + if (!active_platform_fw_data) + return -EINVAL; + /* Get SiP SVC version number */ ret = zynqmp_pm_get_sip_svc_version(&sip_svc_version); if (ret) @@ -2152,10 +2166,22 @@ static void zynqmp_firmware_sync_state(struct device *dev) dev_warn(dev, "failed to release power management to firmware\n"); } +static const struct platform_fw_data platform_fw_data_versal = { + .family_code = PM_VERSAL_FAMILY_CODE, +}; + +static const struct platform_fw_data platform_fw_data_versal_net = { + .family_code = PM_VERSAL_NET_FAMILY_CODE, +}; + +static const struct platform_fw_data platform_fw_data_zynqmp = { + .family_code = PM_ZYNQMP_FAMILY_CODE, +}; + static const struct of_device_id zynqmp_firmware_of_match[] = { - {.compatible = "xlnx,zynqmp-firmware"}, - {.compatible = "xlnx,versal-firmware"}, - {.compatible = "xlnx,versal-net-firmware"}, + {.compatible = "xlnx,zynqmp-firmware", .data = &platform_fw_data_zynqmp}, + {.compatible = "xlnx,versal-firmware", .data = &platform_fw_data_versal}, + {.compatible = "xlnx,versal-net-firmware", .data = &platform_fw_data_versal_net}, {}, }; MODULE_DEVICE_TABLE(of, zynqmp_firmware_of_match); diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 4699f50465f2..6458ef4e04e2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -54,6 +54,11 @@ #define ZYNQMP_FAMILY_CODE 0x23 #define VERSAL_FAMILY_CODE 0x26 +/* Family codes */ +#define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ +#define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ +#define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ + /* When all subfamily of platform need to support */ #define ALL_SUB_FAMILY_CODE 0x00 #define VERSAL_SUB_FAMILY_CODE 0x01 -- cgit v1.2.3 From 25e3ae0ce364fa725a6eea68d63d6a2ee09e019f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:51 -0700 Subject: drivers: firmware: xilinx: Switch to new family code in zynqmp_pm_get_family_info() Currently, the family code and subfamily code are derived from the PMC_TAP_IDCODE register. Versal, Versal NET share the same family code. Also some platforms share the same subfamily code, making it difficult to distinguish between platforms. Update zynqmp_pm_get_family_info() to use IDs derived from the compatible string instead of silicon ID codes derived from PMC_TAP_IDCODE register. 
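A caller-side sketch of the reworked interface is shown below; the example function is hypothetical, while the single-argument prototype and the PM_*_FAMILY_CODE constants come from this series.

#include <linux/errno.h>
#include <linux/firmware/xlnx-zynqmp.h>

static int example_identify_platform(void)
{
	u32 family;
	int ret;

	/* Single out-parameter now; the subfamily argument is gone. */
	ret = zynqmp_pm_get_family_info(&family);
	if (ret < 0)
		return ret;

	switch (family) {
	case PM_ZYNQMP_FAMILY_CODE:
		return 0;	/* ZynqMP-specific setup would go here */
	case PM_VERSAL_FAMILY_CODE:
	case PM_VERSAL_NET_FAMILY_CODE:
		return 0;	/* Versal and Versal NET are now distinguishable */
	default:
		return -ENODEV;
	}
}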
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-4-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 42 +++++++++++++-------------------- drivers/pinctrl/pinctrl-zynqmp.c | 7 +++--- drivers/soc/xilinx/xlnx_event_manager.c | 8 +++---- drivers/soc/xilinx/zynqmp_power.c | 10 ++++---- include/linux/firmware/xlnx-zynqmp.h | 15 ++---------- 5 files changed, 31 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 2d250a16d3dd..835a50c5af46 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -473,8 +473,6 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...) static u32 pm_api_version; static u32 pm_tz_version; -static u32 pm_family_code; -static u32 pm_sub_family_code; int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) { @@ -541,32 +539,18 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_chipid); /** * zynqmp_pm_get_family_info() - Get family info of platform * @family: Returned family code value - * @subfamily: Returned sub-family code value * * Return: Returns status, either success or error+reason */ -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +int zynqmp_pm_get_family_info(u32 *family) { - u32 ret_payload[PAYLOAD_ARG_CNT]; - u32 idcode; - int ret; - - /* Check is family or sub-family code already received */ - if (pm_family_code && pm_sub_family_code) { - *family = pm_family_code; - *subfamily = pm_sub_family_code; - return 0; - } + if (!active_platform_fw_data) + return -ENODEV; - ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, ret_payload, 0); - if (ret < 0) - return ret; + if (!family) + return -EINVAL; - idcode = ret_payload[1]; - pm_family_code = FIELD_GET(FAMILY_CODE_MASK, idcode); - pm_sub_family_code = FIELD_GET(SUB_FAMILY_CODE_MASK, idcode); - *family = pm_family_code; - *subfamily = pm_sub_family_code; + *family = active_platform_fw_data->family_code; return 0; } @@ -1247,8 +1231,13 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, u32 value) { int ret; + u32 pm_family_code; + + ret = zynqmp_pm_get_family_info(&pm_family_code); + if (ret) + return ret; - if (pm_family_code == ZYNQMP_FAMILY_CODE && + if (pm_family_code == PM_ZYNQMP_FAMILY_CODE && param == PM_PINCTRL_CONFIG_TRI_STATE) { ret = zynqmp_pm_feature(PM_PINCTRL_CONFIG_PARAM_SET); if (ret < PM_PINCTRL_PARAM_SET_VERSION) { @@ -2055,6 +2044,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct zynqmp_devinfo *devinfo; + u32 pm_family_code; int ret; ret = get_set_conduit_method(dev->of_node); @@ -2098,8 +2088,8 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) pr_info("%s Platform Management API v%d.%d\n", __func__, pm_api_version >> 16, pm_api_version & 0xFFFF); - /* Get the Family code and sub family code of platform */ - ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + /* Get the Family code of platform */ + ret = zynqmp_pm_get_family_info(&pm_family_code); if (ret < 0) return ret; @@ -2126,7 +2116,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) zynqmp_pm_api_debugfs_init(); - if (pm_family_code == VERSAL_FAMILY_CODE) { + if (pm_family_code != PM_ZYNQMP_FAMILY_CODE) { em_dev = platform_device_register_data(&pdev->dev, "xlnx_event_manager", -1, NULL, 0); if (IS_ERR(em_dev)) diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c index 
fddf0fef4b13..71eaac81deb1 100644 --- a/drivers/pinctrl/pinctrl-zynqmp.c +++ b/drivers/pinctrl/pinctrl-zynqmp.c @@ -100,7 +100,6 @@ struct zynqmp_pctrl_group { static struct pinctrl_desc zynqmp_desc; static u32 family_code; -static u32 sub_family_code; static int zynqmp_pctrl_get_groups_count(struct pinctrl_dev *pctldev) { @@ -605,7 +604,7 @@ static int zynqmp_pinctrl_prepare_func_groups(struct device *dev, u32 fid, return -ENOMEM; for (pin = 0; pin < groups[resp[i]].npins; pin++) { - if (family_code == ZYNQMP_FAMILY_CODE) + if (family_code == PM_ZYNQMP_FAMILY_CODE) __set_bit(groups[resp[i]].pins[pin], used_pins); else __set_bit((u8)groups[resp[i]].pins[pin] - 1, used_pins); @@ -958,11 +957,11 @@ static int zynqmp_pinctrl_probe(struct platform_device *pdev) if (!pctrl) return -ENOMEM; - ret = zynqmp_pm_get_family_info(&family_code, &sub_family_code); + ret = zynqmp_pm_get_family_info(&family_code); if (ret < 0) return ret; - if (family_code == ZYNQMP_FAMILY_CODE) { + if (family_code == PM_ZYNQMP_FAMILY_CODE) { ret = zynqmp_pinctrl_prepare_pin_desc(&pdev->dev, &zynqmp_desc.pins, &zynqmp_desc.npins); } else { diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c index a572d15f6161..6fdf4d14b7e7 100644 --- a/drivers/soc/xilinx/xlnx_event_manager.c +++ b/drivers/soc/xilinx/xlnx_event_manager.c @@ -77,17 +77,17 @@ struct registered_event_data { static bool xlnx_is_error_event(const u32 node_id) { - u32 pm_family_code, pm_sub_family_code; + u32 pm_family_code; - zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + zynqmp_pm_get_family_info(&pm_family_code); - if (pm_sub_family_code == VERSAL_SUB_FAMILY_CODE) { + if (pm_family_code == PM_VERSAL_FAMILY_CODE) { if (node_id == VERSAL_EVENT_ERROR_PMC_ERR1 || node_id == VERSAL_EVENT_ERROR_PMC_ERR2 || node_id == VERSAL_EVENT_ERROR_PSM_ERR1 || node_id == VERSAL_EVENT_ERROR_PSM_ERR2) return true; - } else { + } else if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE) { if (node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR1 || node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR2 || node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR3 || diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index ae59bf16659a..9b7b2858b22a 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -285,7 +285,7 @@ static int register_event(struct device *dev, const enum pm_api_cb_id cb_type, c static int zynqmp_pm_probe(struct platform_device *pdev) { int ret, irq; - u32 pm_api_version, pm_family_code, pm_sub_family_code, node_id; + u32 pm_api_version, pm_family_code, node_id; struct mbox_client *client; ret = zynqmp_pm_get_api_version(&pm_api_version); @@ -315,14 +315,16 @@ static int zynqmp_pm_probe(struct platform_device *pdev) INIT_WORK(&zynqmp_pm_init_suspend_work->callback_work, zynqmp_pm_init_suspend_work_fn); - ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + ret = zynqmp_pm_get_family_info(&pm_family_code); if (ret < 0) return ret; - if (pm_sub_family_code == VERSALNET_SUB_FAMILY_CODE) + if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE) node_id = PM_DEV_ACPU_0_0; - else + else if (pm_family_code == PM_VERSAL_FAMILY_CODE) node_id = PM_DEV_ACPU_0; + else + return -ENODEV; ret = register_event(&pdev->dev, PM_NOTIFY_CB, node_id, EVENT_SUBSYSTEM_RESTART, false, subsystem_restart_event_callback); diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 6458ef4e04e2..be6817ac5120 100644 --- a/include/linux/firmware/xlnx-zynqmp.h 
+++ b/include/linux/firmware/xlnx-zynqmp.h @@ -51,22 +51,11 @@ #define PM_PINCTRL_PARAM_SET_VERSION 2 -#define ZYNQMP_FAMILY_CODE 0x23 -#define VERSAL_FAMILY_CODE 0x26 - /* Family codes */ #define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ #define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ #define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ -/* When all subfamily of platform need to support */ -#define ALL_SUB_FAMILY_CODE 0x00 -#define VERSAL_SUB_FAMILY_CODE 0x01 -#define VERSALNET_SUB_FAMILY_CODE 0x03 - -#define FAMILY_CODE_MASK GENMASK(27, 21) -#define SUB_FAMILY_CODE_MASK GENMASK(20, 19) - #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) #define PLM_MODULE_ID_MASK GENMASK(15, 8) @@ -570,7 +559,7 @@ int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily); +int zynqmp_pm_get_family_info(u32 *family); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); @@ -651,7 +640,7 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return -ENODEV; } -static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +static inline int zynqmp_pm_get_family_info(u32 *family) { return -ENODEV; } -- cgit v1.2.3 From 93a4b36ef3cf4ce5e6a7e7a7686181de76e246a1 Mon Sep 17 00:00:00 2001 From: Nirbhay Sharma Date: Fri, 3 Oct 2025 17:15:55 +0530 Subject: cgroup: Fix seqcount lockdep assertion in cgroup freezer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit afa3701c0e45 ("cgroup: cgroup.stat.local time accounting") introduced a seqcount to track freeze timing but initialized it as a plain seqcount_t using seqcount_init(). However, the write-side critical section in cgroup_do_freeze() holds the css_set_lock spinlock while calling write_seqcount_begin(). On PREEMPT_RT kernels, spinlocks do not disable preemption, causing the lockdep assertion for a plain seqcount_t, which checks for preemption being disabled, to fail. This triggers the following warning: WARNING: CPU: 0 PID: 9692 at include/linux/seqlock.h:221 Fix this by changing the type to seqcount_spinlock_t and initializing it with seqcount_spinlock_init() to associate css_set_lock with the seqcount. This allows lockdep to correctly validate that the spinlock is held during write operations, resolving the assertion failure on all kernel configurations. 
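For reference, the seqcount_spinlock_t pattern the fix relies on looks roughly like this generic sketch (example_* names are illustrative, not the cgroup code): the writer holds the associated spinlock, which is exactly what lockdep now validates instead of requiring preemption to be disabled.

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static seqcount_spinlock_t example_seq;
static u64 example_value;

static void example_init(void)
{
	/* Associate the seqcount with the lock that serializes writers. */
	seqcount_spinlock_init(&example_seq, &example_lock);
}

static void example_write(u64 v)
{
	spin_lock(&example_lock);		/* writer side holds the spinlock... */
	write_seqcount_begin(&example_seq);	/* ...so the lockdep assertion is satisfied, PREEMPT_RT included */
	example_value = v;
	write_seqcount_end(&example_seq);
	spin_unlock(&example_lock);
}

static u64 example_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&example_seq);
		v = example_value;
	} while (read_seqcount_retry(&example_seq, seq));

	return v;
}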
Reported-by: syzbot+27a2519eb4dad86d0156@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=27a2519eb4dad86d0156 Fixes: afa3701c0e45 ("cgroup: cgroup.stat.local time accounting") Signed-off-by: Nirbhay Sharma Link: https://lore.kernel.org/r/20251002165510.KtY3IT--@linutronix.de/ Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 93318fce31f3..b760a3c470a5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -452,7 +452,7 @@ struct cgroup_freezer_state { int nr_frozen_tasks; /* Freeze time data consistency protection */ - seqcount_t freeze_seq; + seqcount_spinlock_t freeze_seq; /* * Most recent time the cgroup was requested to freeze. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e..fdee387f0d6b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5892,7 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; - seqcount_init(&cgrp->freezer.freeze_seq); + seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be -- cgit v1.2.3 From 48b77733d0dbaf8cd0a122712072f92b2d95d894 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2025 15:19:07 +0200 Subject: expfs: Fix exportfs_can_encode_fh() for EXPORT_FH_FID After commit 5402c4d4d200 ("exportfs: require ->fh_to_parent() to encode connectable file handles") we will fail to create non-decodable file handles for filesystems without export operations. Fix it. Fixes: 5402c4d4d200 ("exportfs: require ->fh_to_parent() to encode connectable file handles") Reviewed-by: Christian Brauner Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/exportfs.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index d0cf10d5e0f7..f0cf2714ec52 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -320,9 +320,6 @@ static inline bool exportfs_can_decode_fh(const struct export_operations *nop) static inline bool exportfs_can_encode_fh(const struct export_operations *nop, int fh_flags) { - if (!nop) - return false; - /* * If a non-decodeable file handle was requested, we only need to make * sure that filesystem did not opt-out of encoding fid. @@ -330,6 +327,10 @@ static inline bool exportfs_can_encode_fh(const struct export_operations *nop, if (fh_flags & EXPORT_FH_FID) return exportfs_can_encode_fid(nop); + /* Normal file handles cannot be created without export ops */ + if (!nop) + return false; + /* * If a connectable file handle was requested, we need to make sure that * filesystem can also decode connected file handles. -- cgit v1.2.3 From f233d4855918547f19c5bff95223706d1c836b7c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 7 Oct 2025 22:03:48 +0000 Subject: bpf: Refactor storage_get_func_atomic to generic non_sleepable flag Rename the storage_get_func_atomic flag to a more generic non_sleepable flag that tracks whether a helper or kfunc may be called from a non-sleepable context. This makes the flag more broadly applicable beyond just storage_get helpers. 
See [0] for more context. The flag is now set unconditionally for all helpers and kfuncs when: - RCU critical section is active. - Preemption is disabled. - IRQs are disabled. - In a non-sleepable context within a sleepable program (e.g., timer callbacks), which is indicated by !in_sleepable(). Previously, the flag was only set for storage_get helpers in these contexts. With this change, it can be used by any code that needs to differentiate between sleepable and non-sleepable contexts at the per-instruction level. The existing usage in do_misc_fixups() for storage_get helpers is preserved by checking is_storage_get_function() before using the flag. [0]: https://lore.kernel.org/bpf/CAP01T76cbaNi4p-y8E0sjE2NXSra2S=Uja8G4hSQDu_SbXxREQ@mail.gmail.com Cc: Mykyta Yatsenko Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251007220349.3852807-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526..b57222a25a4a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -548,7 +548,7 @@ struct bpf_insn_aux_data { bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ - bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */ bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 32123c4b041a..85a953124412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11371,6 +11371,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return *ptr && (*ptr)->func ? 0 : -EINVAL; } +/* Check if we're in a sleepable context. 
*/ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_lock && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11437,9 +11446,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_preempt_locks) { @@ -11448,9 +11454,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_irq_id) { @@ -11459,17 +11462,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } - /* - * Non-sleepable contexts in sleepable programs (e.g., timer callbacks) - * are atomic and must use GFP_ATOMIC for storage_get helpers. - */ - if (!in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + /* Track non-sleepable context for helpers. */ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; meta.func_id = func_id; /* check args */ @@ -13880,6 +13877,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. */ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) @@ -22502,7 +22503,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (env->insn_aux_data[i + delta].storage_get_func_atomic) + if (env->insn_aux_data[i + delta].non_sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); -- cgit v1.2.3 From 4c97c4b149a019a3b318dc6ea3dc96efe0ee1f39 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 10 Oct 2025 17:46:06 +0100 Subject: bpf: Extract internal structs validation logic into helpers The arraymap and hashtab duplicate the logic that checks for and frees internal structs (timer, workqueue, task_work) based on BTF record flags. Centralize this by introducing two helpers: * bpf_map_has_internal_structs(map) Returns true if the map value contains any of internal structs: BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK. * bpf_map_free_internal_structs(map, obj) Frees the internal structs for a single value object. Convert arraymap and both the prealloc/malloc hashtab paths to use the new generic functions. This keeps the functionality for when/how to free these special fields in one place and makes it easier to add support for new internal structs in the future without touching every map implementation. 
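A hypothetical map implementation would consume the two new helpers roughly as follows; for_each_example_map_value() is an illustrative placeholder for whatever per-element iteration the map uses, not a real kernel macro.

/* Sketch: free timer/workqueue/task_work fields of every value when the
 * last user reference is dropped.
 */
static void example_map_free_internal_structs(struct bpf_map *map)
{
	void *val;

	/* Cheap check first: does the value layout contain any such field? */
	if (!bpf_map_has_internal_structs(map))
		return;

	for_each_example_map_value(map, val)
		bpf_map_free_internal_structs(map, val);
}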
Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251010164606.147298-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ kernel/bpf/arraymap.c | 19 ++++++------------- kernel/bpf/hashtab.c | 36 +++++++++++++----------------------- kernel/bpf/helpers.c | 10 ++++++++++ 4 files changed, 36 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a98c83346134..f87fb203aaae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -663,6 +663,13 @@ int map_check_no_btf(const struct bpf_map *map, bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); +static inline bool bpf_map_has_internal_structs(struct bpf_map *map) +{ + return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); +} + +void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 80b1765a3159..0ba790c2d2e5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -448,19 +448,12 @@ static void array_map_free_internal_structs(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer and workqueue - * on uref dropping to zero. - */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - for (i = 0; i < array->map.max_entries; i++) { - if (btf_record_has_field(map->record, BPF_TIMER)) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_TASK_WORK)) - bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); - } - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c2fcd0cd51e5..e7a6ba04dc82 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) -{ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) - bpf_obj_free_task_work(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); -} - static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - htab_free_internal_structs(htab, elem); + bpf_map_free_internal_structs(&htab->map, + 
htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -1509,8 +1497,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We only free timer on uref dropping to zero */ - htab_free_internal_structs(htab, l); + /* We only free internal structs on uref dropping to zero */ + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1521,13 +1510,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_internal_structs(htab); - else - htab_free_prealloced_internal_structs(htab); - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + if (htab_is_prealloc(htab)) + htab_free_prealloced_internal_structs(htab); + else + htab_free_malloced_internal_structs(htab); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 485f65fbd97f..dea8443f782c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4487,3 +4487,13 @@ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } + +void bpf_map_free_internal_structs(struct bpf_map *map, void *val) +{ + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, val); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, val); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, val); +} -- cgit v1.2.3 From ed4a5c5de56ad4e23c9e5da8981639352b63b8ac Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 23 Sep 2025 18:16:07 +0000 Subject: usb: typec: class: add typec_get_data_role symbol Alt Mode drivers are responsible for sending Enter Mode through the TCPM, but only a DFP is allowed to send Enter Mode. typec_get_data_role gets the port's data role, which can then be used in altmode drivers via typec_altmode_get_data_role to know if Enter Mode should be sent. 
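As a usage sketch, an alternate mode driver could gate Enter Mode on the new accessor like this; the example function is hypothetical, while typec_altmode_enter() is the existing altmode entry helper.

#include <linux/errno.h>
#include <linux/usb/typec.h>
#include <linux/usb/typec_altmode.h>

static int example_altmode_activate(struct typec_altmode *alt)
{
	/* Only the DFP (data host) may issue Enter Mode. */
	if (typec_altmode_get_data_role(alt) != TYPEC_HOST)
		return -EOPNOTSUPP;

	return typec_altmode_enter(alt, NULL);	/* no extra VDOs in this sketch */
}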
Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250923181606.1583584-5-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 13 +++++++++++++ include/linux/usb/typec.h | 1 + include/linux/usb/typec_altmode.h | 13 +++++++++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 67a533e35150..9b2647cb199b 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -2120,6 +2120,19 @@ void typec_set_data_role(struct typec_port *port, enum typec_data_role role) } EXPORT_SYMBOL_GPL(typec_set_data_role); +/** + * typec_get_data_role - Get port data role + * @port: The USB Type-C Port to query + * + * This routine is used by the altmode drivers to determine if the port is the + * DFP before issuing Enter Mode + */ +enum typec_data_role typec_get_data_role(struct typec_port *port) +{ + return port->data_role; +} +EXPORT_SYMBOL_GPL(typec_get_data_role); + /** * typec_set_pwr_role - Report power role change * @port: The USB Type-C Port where the role was changed diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 252af3f77039..309251572e2e 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -337,6 +337,7 @@ struct typec_plug *typec_register_plug(struct typec_cable *cable, void typec_unregister_plug(struct typec_plug *plug); void typec_set_data_role(struct typec_port *port, enum typec_data_role role); +enum typec_data_role typec_get_data_role(struct typec_port *port); void typec_set_pwr_role(struct typec_port *port, enum typec_role role); void typec_set_vconn_role(struct typec_port *port, enum typec_role role); void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index b3c0866ea70f..f7db3bd4c90e 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -172,6 +172,19 @@ typec_altmode_get_svdm_version(struct typec_altmode *altmode) return typec_get_negotiated_svdm_version(typec_altmode2port(altmode)); } +/** + * typec_altmode_get_data_role - Get port data role + * @altmode: Handle to the alternate mode + * + * Alt Mode drivers should only issue Enter Mode through the port if they are + * the DFP. + */ +static inline enum typec_data_role +typec_altmode_get_data_role(struct typec_altmode *altmode) +{ + return typec_get_data_role(typec_altmode2port(altmode)); +} + /** * struct typec_altmode_driver - USB Type-C alternate mode device driver * @id_table: Null terminated array of SVIDs -- cgit v1.2.3 From 536bf30d282a6b2f676c6106587f0e1946449aca Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:53 -0500 Subject: iio: buffer: document iio_push_to_buffers_with_ts() Document the iio_push_to_buffers_with_ts() function. This is copied and slightly cleaned up from iio_push_to_buffers_with_timestamp(). 
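For context, a call site in a triggered-buffer handler typically looks roughly like the sketch below; the channel layout and the handler itself are made up, and the helper takes the total size of the sample buffer plus the capture timestamp.

#include <linux/iio/iio.h>
#include <linux/iio/buffer.h>
#include <linux/iio/trigger_consumer.h>
#include <linux/interrupt.h>

static irqreturn_t example_trigger_handler(int irq, void *p)
{
	struct iio_poll_func *pf = p;
	struct iio_dev *indio_dev = pf->indio_dev;
	struct {
		s16 channels[2];
		s64 timestamp __aligned(8);	/* room for the appended timestamp */
	} scan = { };

	/* ... fill scan.channels from the hardware (omitted) ... */

	iio_push_to_buffers_with_ts(indio_dev, &scan, sizeof(scan),
				    iio_get_time_ns(indio_dev));

	iio_trigger_notify_done(indio_dev->trig);
	return IRQ_HANDLED;
}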
Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 5c84ec4a9810..e46b818981aa 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -45,6 +45,22 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, return iio_push_to_buffers(indio_dev, data); } +/** + * iio_push_to_buffers_with_ts() - push data and timestamp to buffers + * @indio_dev: iio_dev structure for device. + * @data: Pointer to sample data buffer. + * @data_total_len: The size of @data in bytes. + * @timestamp: Timestamp for the sample data. + * + * Pushes data to the IIO device's buffers. If timestamps are enabled for the + * device the function will store the supplied timestamp as the last element in + * the sample data buffer before pushing it to the device buffers. The sample + * data buffer needs to be large enough to hold the additional timestamp + * (usually the buffer should be at least indio->scan_bytes bytes large). + * + * Context: Any context. + * Return: 0 on success, a negative error code otherwise. + */ static inline int iio_push_to_buffers_with_ts(struct iio_dev *indio_dev, void *data, size_t data_total_len, s64 timestamp) -- cgit v1.2.3 From 4992ce003b76ee1629ad4e7332a49ea2619e7523 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:54 -0500 Subject: iio: buffer: deprecated iio_push_to_buffers_with_timestamp() Replace the documentation of iio_push_to_buffers_with_timestamp() with a deprecation notice pointing to the preferred alternative. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index e46b818981aa..d37f82678f71 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -26,11 +26,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); * @data: sample data * @timestamp: timestamp for the sample data * - * Pushes data to the IIO device's buffers. If timestamps are enabled for the - * device the function will store the supplied timestamp as the last element in - * the sample data buffer before pushing it to the device buffers. The sample - * data buffer needs to be large enough to hold the additional timestamp - * (usually the buffer should be indio->scan_bytes bytes large). + * DEPRECATED: Use iio_push_to_buffers_with_ts() instead. * * Returns 0 on success, a negative error code otherwise. */ -- cgit v1.2.3 From 748ed9fc8596015e7e136877465919b89c7d08d6 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:56 -0500 Subject: iio: buffer: document store_to() callback may be called in any context Document that the struct iio_buffer_access_funcs.store_to() callback must be safe to call from any context since it is called from iio_push_to_buffer() which may be called from any context. 
Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..0daff9ff20ce 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -24,7 +24,8 @@ struct sg_table; /** * struct iio_buffer_access_funcs - access functions for buffers. - * @store_to: actually store stuff to the buffer + * @store_to: actually store stuff to the buffer - must be safe to + * call from any context (e.g. must not sleep). * @read: try to get a specified number of bytes (must exist) * @data_available: indicates how much data is available for reading from * the buffer. -- cgit v1.2.3 From 592ae0ccecfac9af8f67444cab11cbb11770f571 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:57 -0500 Subject: iio: buffer: document that buffer callback must be context safe Document that the callback registered with iio_channel_get_all_cb() must be safe to call from any context since it is called from by iio_push_to_buffer() which can be called in any context. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-buffer-cb.c | 1 + include/linux/iio/consumer.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c index 3e27385069ed..f4ebff968493 100644 --- a/drivers/iio/buffer/industrialio-buffer-cb.c +++ b/drivers/iio/buffer/industrialio-buffer-cb.c @@ -13,6 +13,7 @@ struct iio_cb_buffer { struct iio_buffer buffer; + /* Must be safe to call from any context (e.g. must not sleep). */ int (*cb)(const void *data, void *private); void *private; struct iio_channel *channels; diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index a38b277c2c02..5039558267e4 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -131,7 +131,8 @@ struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture * @dev: Pointer to client device. - * @cb: Callback function. + * @cb: Callback function. Must be safe to call from any context + * (e.g. must not sleep). * @private: Private data passed to callback. * * NB right now we have no ability to mux data from multiple devices. -- cgit v1.2.3 From a514bb109eada64f798f1c86c17182229cc20fe7 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 7 Oct 2025 10:15:21 +0100 Subject: iio: buffer: support getting dma channel from the buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new buffer accessor .get_dma_dev() in order to get the struct device responsible for actually providing the dma channel. We cannot assume that we can use the parent of the IIO device for mapping the DMA buffer. This becomes important on systems (like the Xilinx/AMD zynqMP Ultrascale) where memory (or part of it) is mapped above the 32 bit range. On such systems and given that a device by default has a dma mask of 32 bits we would then need to rely on bounce buffers (to swiotlb) for mapping memory above the dma mask limit. In the process, add an iio_buffer_get_dma_dev() helper function to get the proper DMA device. 
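A rough sketch of the provider side follows, assuming a hypothetical DMAengine-backed buffer implementation: the callback simply reports the struct device that should be used for DMA-buf mapping. The example_* names and the dma_chan wiring are assumptions for illustration, not part of this patch.

#include <linux/dmaengine.h>
#include <linux/iio/buffer_impl.h>

struct example_buffer {
	struct iio_buffer buffer;
	struct dma_chan *chan;
};

static struct device *example_buffer_get_dma_dev(struct iio_buffer *buffer)
{
	struct example_buffer *eb = container_of(buffer, struct example_buffer, buffer);

	/* The device that owns the DMA channel, not indio_dev->dev.parent. */
	return eb->chan->device->dev;
}

static const struct iio_buffer_access_funcs example_buffer_access_funcs = {
	/* ... other callbacks ... */
	.get_dma_dev = example_buffer_get_dma_dev,
};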
Cc: stable@vger.kernel.org Reviewed-by: David Lechner Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-buffer.c | 21 ++++++++++++++++----- include/linux/iio/buffer_impl.h | 2 ++ 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index a80f7cc25a27..96ea0f039dfb 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -1623,19 +1623,28 @@ static int iio_dma_resv_lock(struct dma_buf *dmabuf, bool nonblock) return 0; } +static struct device *iio_buffer_get_dma_dev(const struct iio_dev *indio_dev, + struct iio_buffer *buffer) +{ + if (buffer->access->get_dma_dev) + return buffer->access->get_dma_dev(buffer); + + return indio_dev->dev.parent; +} + static struct dma_buf_attachment * iio_buffer_find_attachment(struct iio_dev_buffer_pair *ib, struct dma_buf *dmabuf, bool nonblock) { - struct device *dev = ib->indio_dev->dev.parent; struct iio_buffer *buffer = ib->buffer; + struct device *dma_dev = iio_buffer_get_dma_dev(ib->indio_dev, buffer); struct dma_buf_attachment *attach = NULL; struct iio_dmabuf_priv *priv; guard(mutex)(&buffer->dmabufs_mutex); list_for_each_entry(priv, &buffer->dmabufs, entry) { - if (priv->attach->dev == dev + if (priv->attach->dev == dma_dev && priv->attach->dmabuf == dmabuf) { attach = priv->attach; break; @@ -1653,6 +1662,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, { struct iio_dev *indio_dev = ib->indio_dev; struct iio_buffer *buffer = ib->buffer; + struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer); struct dma_buf_attachment *attach; struct iio_dmabuf_priv *priv, *each; struct dma_buf *dmabuf; @@ -1679,7 +1689,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, goto err_free_priv; } - attach = dma_buf_attach(dmabuf, indio_dev->dev.parent); + attach = dma_buf_attach(dmabuf, dma_dev); if (IS_ERR(attach)) { err = PTR_ERR(attach); goto err_dmabuf_put; @@ -1719,7 +1729,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, * combo. If we do, refuse to attach. */ list_for_each_entry(each, &buffer->dmabufs, entry) { - if (each->attach->dev == indio_dev->dev.parent + if (each->attach->dev == dma_dev && each->attach->dmabuf == dmabuf) { /* * We unlocked the reservation object, so going through @@ -1758,6 +1768,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib, { struct iio_buffer *buffer = ib->buffer; struct iio_dev *indio_dev = ib->indio_dev; + struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer); struct iio_dmabuf_priv *priv; struct dma_buf *dmabuf; int dmabuf_fd, ret = -EPERM; @@ -1772,7 +1783,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib, guard(mutex)(&buffer->dmabufs_mutex); list_for_each_entry(priv, &buffer->dmabufs, entry) { - if (priv->attach->dev == indio_dev->dev.parent + if (priv->attach->dev == dma_dev && priv->attach->dmabuf == dmabuf) { list_del(&priv->entry); diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..8d770ced66b2 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -50,6 +50,7 @@ struct sg_table; * @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF * object to this buffer. Requires a valid DMABUF fd, that * was previouly attached to this buffer. + * @get_dma_dev: called to get the DMA channel associated with this buffer. 
* @lock_queue: called when the core needs to lock the buffer queue; * it is used when enqueueing DMABUF objects. * @unlock_queue: used to unlock a previously locked buffer queue @@ -90,6 +91,7 @@ struct iio_buffer_access_funcs { struct iio_dma_buffer_block *block, struct dma_fence *fence, struct sg_table *sgt, size_t size, bool cyclic); + struct device * (*get_dma_dev)(struct iio_buffer *buffer); void (*lock_queue)(struct iio_buffer *buffer); void (*unlock_queue)(struct iio_buffer *buffer); -- cgit v1.2.3 From f9c198c3ccaf90a1a265fb2ffa8d4b093c3b0784 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 7 Oct 2025 10:15:22 +0100 Subject: iio: buffer-dma: support getting the DMA channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the .get_dma_dev() callback for DMA buffers by returning the device that owns the DMA channel. This allows the core DMABUF infrastructure to properly map DMA buffers using the correct device, avoiding the need for bounce buffers on systems where memory is mapped above the 32-bit range. The function returns the DMA queue's device, which is the actual device responsible for DMA operations in buffer-dma implementations. Cc: stable@vger.kernel.org Reviewed-by: David Lechner Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-buffer-dma.c | 6 ++++++ include/linux/iio/buffer-dma.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c index ee294a775e8a..7a7a9d37339b 100644 --- a/drivers/iio/buffer/industrialio-buffer-dma.c +++ b/drivers/iio/buffer/industrialio-buffer-dma.c @@ -786,6 +786,12 @@ out_end_signalling: } EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_enqueue_dmabuf, "IIO_DMA_BUFFER"); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer) +{ + return iio_buffer_to_queue(buffer)->dev; +} +EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_get_dma_dev, "IIO_DMA_BUFFER"); + void iio_dma_buffer_lock_queue(struct iio_buffer *buffer) { struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buffer); diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 5eb66a399002..4f33e6a39797 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer, size_t size, bool cyclic); void iio_dma_buffer_lock_queue(struct iio_buffer *buffer); void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer); #endif -- cgit v1.2.3 From 11fb1a82aefa6f7fea6ac82334edb5639b9927df Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 23 Sep 2025 16:09:27 +0100 Subject: firmware: arm_ffa: Add support for IMPDEF value in the memory access descriptor FF-A v1.2 introduced a 16-byte IMPLEMENTATION DEFINED value in the endpoint memory access descriptor to allow any sender to specify a custom value for each receiver. This value must also be specified by the receiver when retrieving the memory region. The sender must ensure it informs the receiver of this value via an IMPLEMENTATION DEFINED mechanism such as a partition message. So the FF-A driver can use the message interfaces to communicate the value and set the same in the ffa_mem_region_attributes structures when using the memory interfaces.
The driver ensure that the size of the endpoint memory access descriptors is set correctly based on the FF-A version. Fixes: 9fac08d9d985 ("firmware: arm_ffa: Upgrade FF-A version to v1.2 in the driver") Reported-by: Lixiang Mao Tested-by: Lixiang Mao Message-Id: <20250923150927.1218364-1-sudeep.holla@arm.com> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 37 +++++++++++++++++++++++++++---------- include/linux/arm_ffa.h | 21 +++++++++++++++++++-- 2 files changed, 46 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index 65bf1685350a..c72ee4756585 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -649,6 +649,26 @@ static u16 ffa_memory_attributes_get(u32 func_id) return FFA_MEM_NORMAL | FFA_MEM_WRITE_BACK | FFA_MEM_INNER_SHAREABLE; } +static void ffa_emad_impdef_value_init(u32 version, void *dst, void *src) +{ + struct ffa_mem_region_attributes *ep_mem_access; + + if (FFA_EMAD_HAS_IMPDEF_FIELD(version)) + memcpy(dst, src, sizeof(ep_mem_access->impdef_val)); +} + +static void +ffa_mem_region_additional_setup(u32 version, struct ffa_mem_region *mem_region) +{ + if (!FFA_MEM_REGION_HAS_EP_MEM_OFFSET(version)) { + mem_region->ep_mem_size = 0; + } else { + mem_region->ep_mem_size = ffa_emad_size_get(version); + mem_region->ep_mem_offset = sizeof(*mem_region); + memset(mem_region->reserved, 0, 12); + } +} + static int ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize, struct ffa_mem_ops_args *args) @@ -667,27 +687,24 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize, mem_region->flags = args->flags; mem_region->sender_id = drv_info->vm_id; mem_region->attributes = ffa_memory_attributes_get(func_id); - ep_mem_access = buffer + - ffa_mem_desc_offset(buffer, 0, drv_info->version); composite_offset = ffa_mem_desc_offset(buffer, args->nattrs, drv_info->version); - for (idx = 0; idx < args->nattrs; idx++, ep_mem_access++) { + for (idx = 0; idx < args->nattrs; idx++) { + ep_mem_access = buffer + + ffa_mem_desc_offset(buffer, idx, drv_info->version); ep_mem_access->receiver = args->attrs[idx].receiver; ep_mem_access->attrs = args->attrs[idx].attrs; ep_mem_access->composite_off = composite_offset; ep_mem_access->flag = 0; ep_mem_access->reserved = 0; + ffa_emad_impdef_value_init(drv_info->version, + ep_mem_access->impdef_val, + args->attrs[idx].impdef_val); } mem_region->handle = 0; mem_region->ep_count = args->nattrs; - if (drv_info->version <= FFA_VERSION_1_0) { - mem_region->ep_mem_size = 0; - } else { - mem_region->ep_mem_size = sizeof(*ep_mem_access); - mem_region->ep_mem_offset = sizeof(*mem_region); - memset(mem_region->reserved, 0, 12); - } + ffa_mem_region_additional_setup(drv_info->version, mem_region); composite = buffer + composite_offset; composite->total_pg_cnt = ffa_get_num_pages_sg(args->sg); diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index cd7ee4df9045..81e603839c4a 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -338,6 +338,7 @@ struct ffa_mem_region_attributes { * an `struct ffa_mem_region_addr_range`. 
*/ u32 composite_off; + u8 impdef_val[16]; u64 reserved; }; @@ -417,15 +418,31 @@ struct ffa_mem_region { #define CONSTITUENTS_OFFSET(x) \ (offsetof(struct ffa_composite_mem_region, constituents[x])) +#define FFA_EMAD_HAS_IMPDEF_FIELD(version) ((version) >= FFA_VERSION_1_2) +#define FFA_MEM_REGION_HAS_EP_MEM_OFFSET(version) ((version) > FFA_VERSION_1_0) + +static inline u32 ffa_emad_size_get(u32 ffa_version) +{ + u32 sz; + struct ffa_mem_region_attributes *ep_mem_access; + + if (FFA_EMAD_HAS_IMPDEF_FIELD(ffa_version)) + sz = sizeof(*ep_mem_access); + else + sz = sizeof(*ep_mem_access) - sizeof(ep_mem_access->impdef_val); + + return sz; +} + static inline u32 ffa_mem_desc_offset(struct ffa_mem_region *buf, int count, u32 ffa_version) { - u32 offset = count * sizeof(struct ffa_mem_region_attributes); + u32 offset = count * ffa_emad_size_get(ffa_version); /* * Earlier to v1.1, the endpoint memory descriptor array started at * offset 32(i.e. offset of ep_mem_offset in the current structure) */ - if (ffa_version <= FFA_VERSION_1_0) + if (!FFA_MEM_REGION_HAS_EP_MEM_OFFSET(ffa_version)) offset += offsetof(struct ffa_mem_region, ep_mem_offset); else offset += sizeof(struct ffa_mem_region); -- cgit v1.2.3 From b83fb1b14c06bdd765903ac852ba20a14e24f227 Mon Sep 17 00:00:00 2001 From: Axel Haslam Date: Mon, 6 Oct 2025 11:25:41 -0300 Subject: spi: offload: Add offset parameter Add an offset parameter that can be passed in the periodic trigger. This is useful for example when ADC drivers implement a separate periodic signal to trigger conversion and need offload to read the result with some delay. While at it, add some documentation to offload periodic trigger parameters. Reviewed-by: David Lechner Signed-off-by: Axel Haslam Signed-off-by: Marcelo Schmitt Link: https://patch.msgid.link/cd315e95c0bd8523f00e91c400abcd6a418e5924.1759760519.git.marcelo.schmitt@analog.com Signed-off-by: Mark Brown --- drivers/spi/spi-offload-trigger-pwm.c | 3 +++ include/linux/spi/offload/types.h | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/spi/spi-offload-trigger-pwm.c b/drivers/spi/spi-offload-trigger-pwm.c index 805ed41560df..3e8c19227edb 100644 --- a/drivers/spi/spi-offload-trigger-pwm.c +++ b/drivers/spi/spi-offload-trigger-pwm.c @@ -51,12 +51,14 @@ static int spi_offload_trigger_pwm_validate(struct spi_offload_trigger *trigger, wf.period_length_ns = DIV_ROUND_UP_ULL(NSEC_PER_SEC, periodic->frequency_hz); /* REVISIT: 50% duty-cycle for now - may add config parameter later */ wf.duty_length_ns = wf.period_length_ns / 2; + wf.duty_offset_ns = periodic->offset_ns; ret = pwm_round_waveform_might_sleep(st->pwm, &wf); if (ret < 0) return ret; periodic->frequency_hz = DIV_ROUND_UP_ULL(NSEC_PER_SEC, wf.period_length_ns); + periodic->offset_ns = wf.duty_offset_ns; return 0; } @@ -77,6 +79,7 @@ static int spi_offload_trigger_pwm_enable(struct spi_offload_trigger *trigger, wf.period_length_ns = DIV_ROUND_UP_ULL(NSEC_PER_SEC, periodic->frequency_hz); /* REVISIT: 50% duty-cycle for now - may add config parameter later */ wf.duty_length_ns = wf.period_length_ns / 2; + wf.duty_offset_ns = periodic->offset_ns; return pwm_set_waveform_might_sleep(st->pwm, &wf, false); } diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h index 6f7892347871..cd61f8adb7a5 100644 --- a/include/linux/spi/offload/types.h +++ b/include/linux/spi/offload/types.h @@ -57,8 +57,17 @@ enum spi_offload_trigger_type { SPI_OFFLOAD_TRIGGER_PERIODIC, }; +/** + * 
spi_offload_trigger_periodic - configuration parameters for periodic triggers + * @frequency_hz: The rate that the trigger should fire in Hz. + * @offset_ns: A delay in nanoseconds between when this trigger fires + * compared to another trigger. This requires specialized hardware + * that supports such synchronization with a delay between two or + * more triggers. Set to 0 when not needed. + */ struct spi_offload_trigger_periodic { u64 frequency_hz; + u64 offset_ns; }; struct spi_offload_trigger_config { -- cgit v1.2.3 From 5a43dc9f4ee0a3624d0598ee14e8ef8468914525 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 13 Oct 2025 23:03:10 +0900 Subject: firewire: core: detect device quirk when reading configuration ROM Every time the bus manager runs, the cached configuration ROM content of the IRM device is investigated to detect device-specific quirks. This detection can be performed in advance when reading the configuration ROM. This commit adds device quirk flags to the fw_device structure, and initializes them after reading the bus information block of the configuration ROM. The quirk flags are immutable once the configuration ROM has been read. Although they are likely accessed concurrently only by the bus manager, this commit ensures safe access by preventing torn writes and reads using the WRITE_ONCE()/READ_ONCE() macros. Link: https://lore.kernel.org/r/20251013140311.97159-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 21 +++++++-------------- drivers/firewire/core-device.c | 25 +++++++++++++++++++++++-- include/linux/firewire.h | 11 +++++++++++ 3 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e5e0174a0335..6979d6a88ae2 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -86,8 +86,6 @@ static size_t config_rom_length = 1 + 4 + 1 + 1; */ #define DEFAULT_SPLIT_TIMEOUT (2 * 8000) -#define CANON_OUI 0x000085 - static void generate_config_rom(struct fw_card *card, __be32 *config_rom) { struct fw_descriptor *desc; @@ -308,11 +306,9 @@ __must_hold(&card->lock) cpu_to_be32(local_id), }; bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); - bool irm_is_1394_1995_only = false; - bool keep_this_irm = false; struct fw_node *irm_node; struct fw_device *irm_device; - int irm_node_id; + int irm_node_id, irm_device_quirks = 0; int rcode; lockdep_assert_held(&card->lock); @@ -328,15 +324,12 @@ __must_hold(&card->lock) return BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF; } + // NOTE: It is likely that the quirk detection for IRM device has not done yet. irm_device = fw_node_get_device(irm_node); - if (irm_device && irm_device->config_rom) { - irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; - - // Canon MV5i works unreliably if it is not root node. 
- keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; - } - - if (irm_is_1394_1995_only && !keep_this_irm) { + if (irm_device) + irm_device_quirks = READ_ONCE(irm_device->quirks); + if ((irm_device_quirks & FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY) && + !(irm_device_quirks & FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER)) { fw_notice(card, "IRM is not 1394a compliant, making local node (%02x) root\n", local_id); return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; @@ -373,7 +366,7 @@ __must_hold(&card->lock) return BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM; } default: - if (!keep_this_irm) { + if (!(irm_device_quirks & FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER)) { fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", fw_rcode_string(rcode), local_id); return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 457a0da024a7..9bab2d594b89 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -542,6 +542,21 @@ static struct device_attribute fw_device_attributes[] = { __ATTR_NULL, }; +#define CANON_OUI 0x000085 + +static int detect_quirks_by_bus_information_block(const u32 *bus_information_block) +{ + int quirks = 0; + + if ((bus_information_block[2] & 0x000000f0) == 0) + quirks |= FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY; + + if ((bus_information_block[3] >> 8) == CANON_OUI) + quirks |= FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER; + + return quirks; +} + static int read_rom(struct fw_device *device, int generation, int index, u32 *data) { @@ -582,6 +597,7 @@ static int read_config_rom(struct fw_device *device, int generation) u32 *rom, *stack; u32 sp, key; int i, end, length, ret; + int quirks; rom = kmalloc(sizeof(*rom) * MAX_CONFIG_ROM_SIZE + sizeof(*stack) * MAX_CONFIG_ROM_SIZE, GFP_KERNEL); @@ -612,6 +628,11 @@ static int read_config_rom(struct fw_device *device, int generation) } } + quirks = detect_quirks_by_bus_information_block(rom); + + // Just prevent from torn writing/reading. + WRITE_ONCE(device->quirks, quirks); + device->max_speed = device->node->max_speed; /* @@ -1122,10 +1143,10 @@ static void fw_device_init(struct work_struct *work) device->workfn = fw_device_shutdown; fw_schedule_device_work(device, SHUTDOWN_DELAY); } else { - fw_notice(card, "created device %s: GUID %08x%08x, S%d00\n", + fw_notice(card, "created device %s: GUID %08x%08x, S%d00, quirks %08x\n", dev_name(&device->device), device->config_rom[3], device->config_rom[4], - 1 << device->max_speed); + 1 << device->max_speed, device->quirks); device->config_rom_retries = 0; set_broadcast_channel(device, device->generation); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 6d208769d456..161829cfcc00 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -170,6 +170,14 @@ struct fw_attribute_group { struct attribute *attrs[13]; }; +enum fw_device_quirk { + // See afa1282a35d3 ("firewire: core: check for 1394a compliant IRM, fix inaccessibility of Sony camcorder"). + FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY = BIT(0), + + // See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder"). + FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1), +}; + enum fw_device_state { FW_DEVICE_INITIALIZING, FW_DEVICE_RUNNING, @@ -203,6 +211,9 @@ struct fw_device { struct fw_card *card; struct device device; + // A set of enum fw_device_quirk. 
+ int quirks; + struct mutex client_list_mutex; struct list_head client_list; -- cgit v1.2.3 From 15f9610fc96ac6fd2844e63f7bf5a0b08e1c31c8 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 13 Oct 2025 23:03:11 +0900 Subject: firewire: core: handle device quirk of MOTU Audio Express A commit 3a93d082bacf ("ALSA: firewire-motu: add support for MOTU Audio Express") describes a quirk of MOTU Audio Express. The device returns acknowledge packet with 0x10 as the pending state of any types of asynchronous request transaction. It is completely out of specification. This commit implements handling for that device-specific quirk. The quirk is detected after reading the root directory of configuration ROM. When processing the acknowledge code in 1394 OHCI AT context event handler, firewire-ohci module seeks the device instance of destination node by traversing device hierarchy. If the device has the quirk, the acknowledge code is replaced with the standard code. The 1394 OHCI AT context events occur for outgoing asynchronous request packets. The device traversal is safe since no new request initiators exist after the fw_card_instance has been invalidated. Link: https://lore.kernel.org/r/20251013140311.97159-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 53 ++++++++++++++++++++++++++++++++++++++++++ drivers/firewire/ohci.c | 29 +++++++++++++++++++++++ include/linux/firewire.h | 3 +++ 3 files changed, 85 insertions(+) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 9bab2d594b89..33ce4cd357ed 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -557,6 +557,54 @@ static int detect_quirks_by_bus_information_block(const u32 *bus_information_blo return quirks; } +struct entry_match { + unsigned int index; + u32 value; +}; + +static const struct entry_match motu_audio_express_matches[] = { + { 1, 0x030001f2 }, + { 3, 0xd1000002 }, + { 4, 0x8d000005 }, + { 6, 0x120001f2 }, + { 7, 0x13000033 }, + { 8, 0x17104800 }, +}; + +static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned int length) +{ + static const struct { + enum fw_device_quirk quirk; + const struct entry_match *matches; + unsigned int match_count; + } *entry, entries[] = { + { + .quirk = FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE, + .matches = motu_audio_express_matches, + .match_count = ARRAY_SIZE(motu_audio_express_matches), + }, + }; + int quirks = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(entries); ++i) { + int j; + + entry = entries + i; + for (j = 0; j < entry->match_count; ++j) { + unsigned int index = entry->matches[j].index; + unsigned int value = entry->matches[j].value; + + if ((length < index) || (root_directory[index] != value)) + break; + } + if (j == entry->match_count) + quirks |= entry->quirk; + } + + return quirks; +} + static int read_rom(struct fw_device *device, int generation, int index, u32 *data) { @@ -737,6 +785,11 @@ static int read_config_rom(struct fw_device *device, int generation) length = i; } + quirks |= detect_quirks_by_root_directory(rom + ROOT_DIR_OFFSET, length - ROOT_DIR_OFFSET); + + // Just prevent from torn writing/reading. 
+ WRITE_ONCE(device->quirks, quirks); + old_rom = device->config_rom; new_rom = kmemdup(rom, length * 4, GFP_KERNEL); if (new_rom == NULL) { diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 030aed5453a1..757dd9c64b1c 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1319,6 +1319,14 @@ static void at_context_flush(struct at_context *ctx) enable_work(&ctx->work); } +static int find_fw_device(struct device *dev, const void *data) +{ + struct fw_device *device = fw_device(dev); + const u32 *params = data; + + return (device->generation == params[0]) && (device->node_id == params[1]); +} + static int handle_at_packet(struct context *context, struct descriptor *d, struct descriptor *last) @@ -1390,6 +1398,27 @@ static int handle_at_packet(struct context *context, fallthrough; default: + if (unlikely(evt == 0x10)) { + u32 params[2] = { + packet->generation, + async_header_get_destination(packet->header), + }; + struct device *dev; + + fw_card_get(&ohci->card); + dev = device_find_child(ohci->card.device, (const void *)params, find_fw_device); + fw_card_put(&ohci->card); + if (dev) { + struct fw_device *device = fw_device(dev); + int quirks = READ_ONCE(device->quirks); + + put_device(dev); + if (quirks & FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE) { + packet->ack = ACK_PENDING; + break; + } + } + } packet->ack = RCODE_SEND_ERROR; break; } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 161829cfcc00..f1d8734c0ec6 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -176,6 +176,9 @@ enum fw_device_quirk { // See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder"). FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1), + + // MOTU Audio Express transfers acknowledge packet with 0x10 for pending state. + FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2), }; enum fw_device_state { -- cgit v1.2.3 From 300709fbefd19ff7293c7d0ded9b56e69216e634 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 10 Oct 2025 10:51:47 +0200 Subject: mm/memory_hotplug: Remove MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers were introduced to prepare the transition of memory to and from a physically accessible state. This enhancement was crucial for implementing the "memmap on memory" feature for s390. With introduction of dynamic (de)configuration of hotpluggable memory, memory can be brought to accessible state before add_memory(). Memory can be brought to inaccessible state before remove_memory(). Hence, there is no need of MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers anymore. This basically reverts commit c5f1e2d18909 ("mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers") Additionally, apply minor adjustments to the function parameters of move_pfn_range_to_zone() and mhp_supports_memmap_on_memory() to ensure compatibility with the latest branch. 
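For reference, a minimal sketch of what a memory hotplug notifier user is
left to handle once these two events are gone; the callback and its
registration below are purely illustrative and not taken from this series:

  #include <linux/memory.h>
  #include <linux/notifier.h>

  static int example_mem_notifier(struct notifier_block *nb,
                                  unsigned long action, void *data)
  {
          struct memory_notify *arg = data;

          switch (action) {
          case MEM_GOING_ONLINE:
                  /* arg->start_pfn / arg->nr_pages describe the affected range */
                  break;
          case MEM_OFFLINE:
                  break;
          }

          return NOTIFY_OK;
  }

  static struct notifier_block example_mem_nb = {
          .notifier_call = example_mem_notifier,
  };

  /* registered as usual with register_memory_notifier(&example_mem_nb) */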
Acked-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- drivers/base/memory.c | 23 +---------------------- include/linux/memory.h | 9 --------- include/linux/memory_hotplug.h | 18 +----------------- include/linux/memremap.h | 1 - mm/memory_hotplug.c | 17 +++-------------- mm/sparse.c | 3 +-- 6 files changed, 6 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..fc43f2703ae0 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -226,7 +226,6 @@ static int memory_block_online(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; - struct memory_notify arg; struct zone *zone; int ret; @@ -246,19 +245,9 @@ static int memory_block_online(struct memory_block *mem) if (mem->altmap) nr_vmemmap_pages = mem->altmap->free; - arg.altmap_start_pfn = start_pfn; - arg.altmap_nr_pages = nr_vmemmap_pages; - arg.start_pfn = start_pfn + nr_vmemmap_pages; - arg.nr_pages = nr_pages - nr_vmemmap_pages; mem_hotplug_begin(); - ret = memory_notify(MEM_PREPARE_ONLINE, &arg); - ret = notifier_to_errno(ret); - if (ret) - goto out_notifier; - if (nr_vmemmap_pages) { - ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, - zone, mem->altmap->inaccessible); + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) goto out; } @@ -280,11 +269,7 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; - mem_hotplug_done(); - return ret; out: - memory_notify(MEM_FINISH_OFFLINE, &arg); -out_notifier: mem_hotplug_done(); return ret; } @@ -297,7 +282,6 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; - struct memory_notify arg; int ret; if (!mem->zone) @@ -329,11 +313,6 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; - arg.altmap_start_pfn = start_pfn; - arg.altmap_nr_pages = nr_vmemmap_pages; - arg.start_pfn = start_pfn + nr_vmemmap_pages; - arg.nr_pages = nr_pages - nr_vmemmap_pages; - memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..ba1515160894 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { - /* - * The altmap_start_pfn and altmap_nr_pages fields are designated for - * specifying the altmap range and are exclusively intended for use in - * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - */ - unsigned long altmap_start_pfn; - unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 23f038a16231..f2f16cdd73ee 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,22 +58,6 @@ typedef int __bitwise mhp_t; * implies the node id (nid). 
*/ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) -/* - * The hotplugged memory is completely inaccessible while the memory is - * offline. The memory provider will handle MEM_PREPARE_ONLINE / - * MEM_FINISH_OFFLINE notifications and make the memory accessible. - * - * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, - * because the altmap cannot be written (e.g., poisoned) when adding - * memory -- before it is set online. - * - * This allows for adding memory with an altmap that is not currently - * made available by a hypervisor. When onlining that memory, the - * hypervisor can be instructed to make that memory available, and - * the onlining phase will not require any memory allocations, which is - * helpful in low-memory situations. - */ -#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible); + struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..30c7aecbd245 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,7 +25,6 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; - bool inaccessible; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..238a6712738e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible) + struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; - /* - * Memory block is accessible at this stage and hence poison the struct - * pages now. If the memory block is accessible during memory hotplug - * addition phase, then page poisining is already performed in - * sparse_add_section(). 
- */ - if (mhp_off_inaccessible) - page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, false); @@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size, mhp_t mhp_flags) + u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); - if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) - mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); + ret = create_altmaps_and_memory_blocks(nid, group, start, size); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 17c50a6415c2..b5b2b6f7041b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - if (!altmap || !altmap->inaccessible) - page_init_poison(memmap, sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); -- cgit v1.2.3 From 347ed2d566dabb06c7970fff01129c4f59995ed6 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Sat, 11 Oct 2025 15:16:51 +0800 Subject: sched/ext: Implement cgroup_set_idle() callback Implement the missing cgroup_set_idle() callback that was marked as a TODO. This allows BPF schedulers to be notified when a cgroup's idle state changes, enabling them to adjust their scheduling behavior accordingly. The implementation follows the same pattern as other cgroup callbacks like cgroup_set_weight() and cgroup_set_bandwidth(). It checks if the BPF scheduler has implemented the callback and invokes it with the appropriate parameters. Fixes a spelling error in the cgroup_set_bandwidth() documentation. tj: s/scx_cgroup_rwsem/scx_cgroup_ops_rwsem/ to fix build breakage. 
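As a rough sketch of the consumer side (not part of this patch), a BPF
scheduler built on the scx common BPF headers could now hook the new
callback along these lines; the "example_" prefix and the reaction to the
idle change are invented for illustration:

  #include <scx/common.bpf.h>

  void BPF_STRUCT_OPS(example_cgroup_set_idle, struct cgroup *cgrp, bool idle)
  {
          /*
           * React to cpu.idle changes, e.g. stop dispatching from this
           * cgroup's queue while it is marked idle.
           */
          bpf_printk("cgrp %llu idle=%d", cgrp->kn->id, idle);
  }

The program would then be referenced from the scheduler's struct
sched_ext_ops as .cgroup_set_idle.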
Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 16 +++++++++++++++- kernel/sched/ext_internal.h | 13 ++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d82b7a9b0658..9848aeab2786 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -228,6 +228,7 @@ struct scx_task_group { u64 bw_period_us; u64 bw_quota_us; u64 bw_burst_us; + bool idle; #endif }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4b1467d3541a..430749ce46ab 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3066,6 +3066,7 @@ void scx_tg_init(struct task_group *tg) tg->scx.weight = CGROUP_WEIGHT_DFL; tg->scx.bw_period_us = default_bw_period_us(); tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; } int scx_tg_online(struct task_group *tg) @@ -3214,7 +3215,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_bandwidth(struct task_group *tg, @@ -5017,6 +5029,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} +static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -5055,6 +5068,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index b3617abed510..7d00a0a2456e 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -697,12 +697,23 @@ struct sched_ext_ops { * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * interpreted in the same fashion and specifies how much @cgrp can * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF + * interpretation of @period_us and burstiness is up to the BPF * scheduler. */ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us); + /** + * @cgroup_set_idle: A cgroup's idle state is being changed + * @cgrp: cgroup whose idle state is being updated + * @idle: whether the cgroup is entering or exiting idle state + * + * Update @cgrp's idle state to @idle. 
This callback is invoked when + * a cgroup transitions between idle and non-idle states, allowing the + * BPF scheduler to adjust its behavior accordingly. + */ + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* -- cgit v1.2.3 From afb026b6d35c79f6f47752147327932827aeac8c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:13 -0700 Subject: compiler: Tweak __UNIQUE_ID() naming In preparation for the objtool klp diff subcommand, add an underscore between the name and the counter. This will make it possible for objtool to distinguish between the non-unique and unique parts of the symbol name so it can properly correlate the symbols. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..6a32250f22f7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __asm__ ("" : "=r" (var) : "0" (var)) #endif -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) +/* Format: __UNIQUE_ID__<__COUNTER__> */ +#define __UNIQUE_ID(name) \ + __PASTE(__UNIQUE_ID_, \ + __PASTE(name, \ + __PASTE(_, __COUNTER__))) /** * data_race - mark an expression as containing intentional data races -- cgit v1.2.3 From 9f14f1f91883aa2bfd6663161d2002c8ce937c43 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:14 -0700 Subject: compiler.h: Make addressable symbols less of an eyesore Avoid underscore overload by changing: __UNIQUE_ID___addressable_loops_per_jiffy_868 to the following: __UNIQUE_ID_addressable_loops_per_jiffy_868 This matches the format used by other __UNIQUE_ID()-generated symbols and improves readability for those who stare at ELF symbol table dumps. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6a32250f22f7..ab181d87d71d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -287,7 +287,7 @@ static inline void *offset_to_ptr(const int *off) */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -- cgit v1.2.3 From c2d420796a427dda71a2400909864e7f8e037fd4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:15 -0700 Subject: elfnote: Change ELFNOTE() to use __UNIQUE_ID() In preparation for the objtool klp diff subcommand, replace the custom unique symbol name generation in ELFNOTE() with __UNIQUE_ID(). This standardizes the naming format for all "unique" symbols, which will allow objtool to properly correlate them. Note this also removes the "one ELF note per line" limitation. 
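A hypothetical use makes the resulting naming concrete (the counter values
are illustrative, they simply increment per translation unit):

  #include <linux/elfnote.h>

  /*
   * Two notes may now share a source line; the generated symbols are
   * e.g. __UNIQUE_ID_note_0 and __UNIQUE_ID_note_1 instead of the old
   * __LINE__-derived _note_<line>, which would have collided here.
   */
  ELFNOTE32("Example", 1, 42); ELFNOTE32("Example", 2, 43);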
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/elfnote.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 69b136e4dd2b..bb3dcded055f 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -60,23 +60,21 @@ #else /* !__ASSEMBLER__ */ #include +#include /* * Use an anonymous structure which matches the shape of * Elf{32,64}_Nhdr, but includes the name and desc data. The size and * type of name and desc depend on the macro arguments. "name" must - * be a literal string, and "desc" must be passed by value. You may - * only define one note per line, since __LINE__ is used to generate - * unique symbols. + * be a literal string, and "desc" must be passed by value. */ -#define _ELFNOTE_PASTE(a,b) a##b -#define _ELFNOTE(size, name, unique, type, desc) \ +#define ELFNOTE(size, name, type, desc) \ static const struct { \ struct elf##size##_note _nhdr; \ unsigned char _name[sizeof(name)] \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ typeof(desc) _desc \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ - } _ELFNOTE_PASTE(_note_, unique) \ + } __UNIQUE_ID(note) \ __used \ __attribute__((section(".note." name), \ aligned(sizeof(Elf##size##_Word)), \ @@ -89,11 +87,10 @@ name, \ desc \ } -#define ELFNOTE(size, name, type, desc) \ - _ELFNOTE(size, name, __LINE__, type, desc) #define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) #define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) + #endif /* __ASSEMBLER__ */ #endif /* _LINUX_ELFNOTE_H */ -- cgit v1.2.3 From 6717e8f91db71641cb52855ed14c7900972ed0bc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:16 -0700 Subject: kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME In preparation for the objtool klp diff subcommand, remove the arbitrary 'kmod_' prefix from __KBUILD_MODNAME and instead add it explicitly in the __initcall_id() macro. This change supports the standardization of "unique" symbol naming by ensuring the non-unique portion of the name comes before the unique part. That will enable objtool to properly correlate symbols across builds. 
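The generated names themselves should not change, only where the "kmod_"
part gets pasted in. Roughly, for a hypothetical device_initcall(foo_init)
in a module named "foo_bar" (counter and line values invented):

  /*
   * __initcall_id(foo_init) still expands to something like
   *
   *     kmod_foo_bar__123_456_foo_init
   *
   * and the full initcall symbol to roughly
   * __initcall__kmod_foo_bar__123_456_foo_init6, but the "kmod_" prefix
   * is now supplied by __initcall_id() itself rather than being baked
   * into __KBUILD_MODNAME.
   */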
Cc: Masahiro Yamada Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/init.h | 3 ++- scripts/Makefile.lib | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 17c1bc712e23..40331923b9f4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -200,12 +200,13 @@ extern struct module __this_module; /* Format: ____ */ #define __initcall_id(fn) \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(__COUNTER__, \ __PASTE(_, \ __PASTE(__LINE__, \ - __PASTE(_, fn)))))) + __PASTE(_, fn))))))) /* Format: ____ */ #define __initcall_name(prefix, __iid, id) \ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 1d581ba5df66..b95560266124 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -20,7 +20,7 @@ name-fix-token = $(subst $(comma),_,$(subst -,_,$1)) name-fix = $(call stringify,$(call name-fix-token,$1)) basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget)) modname_flags = -DKBUILD_MODNAME=$(call name-fix,$(modname)) \ - -D__KBUILD_MODNAME=kmod_$(call name-fix-token,$(modname)) + -D__KBUILD_MODNAME=$(call name-fix-token,$(modname)) modfile_flags = -DKBUILD_MODFILE=$(call stringify,$(modfile)) _c_flags = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \ -- cgit v1.2.3 From b37491d72b43c3a322d396c2d8e951a10be70c17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Sep 2025 09:30:03 -0700 Subject: interval_tree: Fix ITSTATIC usage for *_subtree_search() For consistency with the other function templates, change _subtree_search_*() to use the user-supplied ITSTATIC rather than the hard-coded 'static'. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h | 4 ++++ include/linux/interval_tree.h | 4 ++++ include/linux/interval_tree_generic.h | 2 +- include/linux/mm.h | 2 ++ lib/interval_tree.c | 1 + tools/include/linux/interval_tree_generic.h | 2 +- 6 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h index 1d7fc3226bca..cfb42a8f5768 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -53,6 +53,10 @@ extern void usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, struct rb_root_cached *root); extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_subtree_search(struct usnic_uiom_interval_node *node, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 2b8026a39906..9d5791e9f737 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -19,6 +19,10 @@ extern void interval_tree_remove(struct interval_tree_node *node, struct rb_root_cached *root); +extern struct interval_tree_node * +interval_tree_subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last); + extern struct interval_tree_node * interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 
1b400f26f63d..c5a2fed49eb0 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..04fa27718cd1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3369,6 +3369,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, + unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 324766e9bf63..9ceb084b6b4e 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -13,6 +13,7 @@ INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); +EXPORT_SYMBOL_GPL(interval_tree_subtree_search); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h index 1b400f26f63d..c5a2fed49eb0 100644 --- a/tools/include/linux/interval_tree_generic.h +++ b/tools/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ -- cgit v1.2.3 From d2c60bde1c0fcac8b140e527546f80749ccd9c67 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:53 -0700 Subject: objtool: Move ANNOTATE* macros to annotate.h In preparation for using the objtool annotation macros in higher-level objtool.h macros like UNWIND_HINT, move them to their own file. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/annotate.h | 109 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/objtool.h | 90 +------------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 include/linux/annotate.h (limited to 'include/linux') diff --git a/include/linux/annotate.h b/include/linux/annotate.h new file mode 100644 index 000000000000..ccb445496331 --- /dev/null +++ b/include/linux/annotate.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ANNOTATE_H +#define _LINUX_ANNOTATE_H + +#include + +#ifdef CONFIG_OBJTOOL + +#ifndef __ASSEMBLY__ + +#define __ASM_ANNOTATE(label, type) \ + ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ + ".long " __stringify(label) " - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + __ASM_ANNOTATE(911b, type) + +#else /* __ASSEMBLY__ */ + +.macro ANNOTATE type:req +.Lhere_\@: + .pushsection .discard.annotate_insn,"M",@progbits,8 + .long .Lhere_\@ - . 
+ .long \type + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +#define __ASM_ANNOTATE(label, type) "" +#define ASM_ANNOTATE(type) +#else /* __ASSEMBLY__ */ +.macro ANNOTATE type:req +.endm +#endif /* __ASSEMBLY__ */ +#endif /* !CONFIG_OBJTOOL */ + +#ifndef __ASSEMBLY__ + +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used to refer to an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. + */ +#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. 
+ */ +#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) + +#else /* __ASSEMBLY__ */ +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#endif /* __ASSEMBLY__ */ + +#endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 46ebaa46e6c5..1973e9f14bf9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -3,11 +3,10 @@ #define _LINUX_OBJTOOL_H #include +#include #ifdef CONFIG_OBJTOOL -#include - #ifndef __ASSEMBLY__ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ @@ -53,16 +52,6 @@ #define __ASM_BREF(label) label ## b -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long " __stringify(label) " - .\n\t" \ - ".long " __stringify(type) "\n\t" \ - ".popsection\n\t" - -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ - __ASM_ANNOTATE(911b, type) - #else /* __ASSEMBLY__ */ /* @@ -111,14 +100,6 @@ #endif .endm -.macro ANNOTATE type:req -.Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 - .long .Lhere_\@ - . - .long \type - .popsection -.endm - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -128,84 +109,15 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) "" -#define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE type:req -.endm #endif #endif /* CONFIG_OBJTOOL */ -#ifndef __ASSEMBLY__ -/* - * Annotate away the various 'relocation to !ENDBR` complaints; knowing that - * these relocations will never be used for indirect calls. - */ -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) - -/* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. - */ -#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) -/* - * See linux/instrumentation.h - */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. 
- */ -#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) -/* - * This should be used to refer to an instruction that is considered - * terminating, like a noreturn CALL or UD2 when we know they are not -- eg - * WARN using UD2. - */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) -/* - * This should not be used; it annotates away CFI violations. There are a few - * valid use cases like kexec handover to the next kernel image, and there is - * no security concern there. - * - * There are also a few real issues annotated away, like EFI because we can't - * control the EFI code. - */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) - -#else -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE -/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ -/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ -#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL -#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE -#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI -#endif - #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) #define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN -- cgit v1.2.3 From 58f36a5756445dcd0a733504cd798955ebe968c1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:54 -0700 Subject: objtool: Add ANNOTATE_DATA_SPECIAL In preparation for the objtool klp diff subcommand, add an ANNOTATE_DATA_SPECIAL macro which annotates special section entries so that objtool can determine their size and location and extract them when needed. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/annotate.h | 49 ++++++++++++++++++++++++++++--------- include/linux/objtool_types.h | 2 ++ tools/include/linux/objtool_types.h | 2 ++ 3 files changed, 41 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/annotate.h b/include/linux/annotate.h index ccb445496331..7c10d34d198c 100644 --- a/include/linux/annotate.h +++ b/include/linux/annotate.h @@ -8,34 +8,52 @@ #ifndef __ASSEMBLY__ -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ +#define __ASM_ANNOTATE(section, label, type) \ + ".pushsection " section ",\"M\", @progbits, 8\n\t" \ ".long " __stringify(label) " - .\n\t" \ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE_LABEL(label, type) \ + __ASM_ANNOTATE(".discard.annotate_insn", label, type) + #define ASM_ANNOTATE(type) \ "911:\n\t" \ - __ASM_ANNOTATE(911b, type) + ASM_ANNOTATE_LABEL(911b, type) + +#define ASM_ANNOTATE_DATA(type) \ + "912:\n\t" \ + __ASM_ANNOTATE(".discard.annotate_data", 912b, type) #else /* __ASSEMBLY__ */ -.macro ANNOTATE type:req +.macro __ANNOTATE section, type .Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 + .pushsection \section, "M", @progbits, 8 .long .Lhere_\@ - . 
.long \type .popsection .endm +.macro ANNOTATE type + __ANNOTATE ".discard.annotate_insn", \type +.endm + +.macro ANNOTATE_DATA type + __ANNOTATE ".discard.annotate_data", \type +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ #ifndef __ASSEMBLY__ -#define __ASM_ANNOTATE(label, type) "" +#define ASM_ANNOTATE_LABEL(label, type) "" #define ASM_ANNOTATE(type) +#define ASM_ANNOTATE_DATA(type) #else /* __ASSEMBLY__ */ -.macro ANNOTATE type:req +.macro ANNOTATE type +.endm +.macro ANNOTATE_DATA type .endm #endif /* __ASSEMBLY__ */ #endif /* !CONFIG_OBJTOOL */ @@ -47,7 +65,7 @@ * these relocations will never be used for indirect calls. */ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) +#define ANNOTATE_NOENDBR_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR)) /* * This should be used immediately before an indirect jump/call. It tells @@ -58,8 +76,8 @@ /* * See linux/instrumentation.h */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +#define ANNOTATE_INSTR_BEGIN(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END) /* * objtool annotation to ignore the alternatives and only consider the original * instruction(s). @@ -83,7 +101,7 @@ * terminating, like a noreturn CALL or UD2 when we know they are not -- eg * WARN using UD2. */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +#define ANNOTATE_REACHABLE(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE) /* * This should not be used; it annotates away CFI violations. There are a few * valid use cases like kexec handover to the next kernel image, and there is @@ -92,7 +110,13 @@ * There are also a few real issues annotated away, like EFI because we can't * control the EFI code. */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) +#define ANNOTATE_NOCFI_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI)) + +/* + * Annotate a special section entry. This emables livepatch module generation + * to find and extract individual special section entries as needed. 
+ */ +#define ANNOTATE_DATA_SPECIAL ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL) #else /* __ASSEMBLY__ */ #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -104,6 +128,7 @@ #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN #define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE #define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#define ANNOTATE_DATA_SPECIAL ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL #endif /* __ASSEMBLY__ */ #endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From aca282ab7e75dd3c1d14230146357a03bef12194 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:55 -0700 Subject: x86/asm: Annotate special section entries In preparation for the objtool klp diff subcommand, add annotations for special section entries. This will enable objtool to determine the size and location of the entries and to extract them when needed. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- arch/x86/include/asm/alternative.h | 4 ++++ arch/x86/include/asm/asm.h | 5 +++++ arch/x86/include/asm/bug.h | 1 + arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/jump_label.h | 1 + include/linux/objtool.h | 4 +++- 6 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 15bc07a5ebb3..b14c045679e1 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_ENTRY(ft_flags) \ ".pushsection .altinstructions,\"a\"\n" \ + ANNOTATE_DATA_SPECIAL \ " .long 771b - .\n" /* label */ \ " .long 774f - .\n" /* new instruction */ \ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \ @@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \ ".pushsection .altinstr_replacement, \"ax\"\n" \ + ANNOTATE_DATA_SPECIAL \ "# ALT: replacement\n" \ "774:\n\t" newinstr "\n775:\n" \ ".popsection\n" @@ -337,6 +339,7 @@ void nop_func(void); * instruction. See apply_alternatives(). */ .macro altinstr_entry orig alt ft_flags orig_len alt_len + ANNOTATE_DATA_SPECIAL .long \orig - . .long \alt - . .4byte \ft_flags @@ -365,6 +368,7 @@ void nop_func(void); .popsection ; \ .pushsection .altinstr_replacement,"ax" ; \ 743: \ + ANNOTATE_DATA_SPECIAL ; \ newinst ; \ 744: \ .popsection ; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index d5c8d3afe196..bd62bd87a841 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H +#include + #ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) 
x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ @@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ + ANNOTATE_DATA_SPECIAL ; \ .long (from) - . ; \ .long (to) - . ; \ .long type ; \ @@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ " .long " __stringify(type) " \n" \ @@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ DEFINE_EXTABLE_TYPE_REG \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 880ca15073ed..372f4018880c 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -57,6 +57,7 @@ #define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \ "1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ + ANNOTATE_DATA_SPECIAL \ __BUG_ENTRY(file, line, flags) \ "\t.org 2b + " size "\n" \ ".popsection\n" \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 893cbca37fe9..fc5f32d4da6e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit) asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" + ANNOTATE_DATA_SPECIAL " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 61dd1dee7812..e0a6930a4029 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -15,6 +15,7 @@ #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ + ANNOTATE_DATA_SPECIAL \ ".long 1b - . \n\t" \ ".long " label " - . \n\t" \ _ASM_PTR " " key " - . \n\t" \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 1973e9f14bf9..4fea6a042b28 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -9,9 +9,10 @@ #ifndef __ASSEMBLY__ -#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ +#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ + ANNOTATE_DATA_SPECIAL \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ ".short " __stringify(sp_offset) "\n\t" \ @@ -78,6 +79,7 @@ .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .Lhere_\@: .pushsection .discard.unwind_hints + ANNOTATE_DATA_SPECIAL /* struct unwind_hint */ .long .Lhere_\@ - . .short \sp_offset -- cgit v1.2.3 From f6b740ef5f4724f95363ac0d664e88d221343fa1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:56 -0700 Subject: objtool: Unify STACK_FRAME_NON_STANDARD entry sizes The C implementation of STACK_FRAME_NON_STANDARD emits 8-byte entries, whereas the asm version's entries are only 4 bytes. Make them consistent by converting the asm version to 8-byte entries. This is much easier than converting the C version to 4-bytes, which would require awkwardly putting inline asm in a dummy function in order to pass the 'func' pointer to the asm. 
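For context, the C flavour of the macro already emits pointer-sized
entries, looking roughly like (paraphrased, not part of this diff):

  #define STACK_FRAME_NON_STANDARD(func)                                        \
          static void __used __section(".discard.func_stack_frame_non_standard") \
                  *__func_stack_frame_non_standard_##func = func

so switching the asm side from ".long \func - ." to ".quad \func" makes
both variants record an 8-byte pointer per entry.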
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/objtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 4fea6a042b28..b18ab53561c9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -92,7 +92,7 @@ .macro STACK_FRAME_NON_STANDARD func:req .pushsection .discard.func_stack_frame_non_standard, "aw" - .long \func - . + .quad \func .popsection .endm -- cgit v1.2.3 From dd590d4d57ebeeb826823c288741f2ed20f452af Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:59 -0700 Subject: objtool/klp: Introduce klp diff subcommand for diffing object files Add a new klp diff subcommand which performs a binary diff between two object files and extracts changed functions into a new object which can then be linked into a livepatch module. This builds on concepts from the longstanding out-of-tree kpatch [1] project which began in 2012 and has been used for many years to generate livepatch modules for production kernels. However, this is a complete rewrite which incorporates hard-earned lessons from 12+ years of maintaining kpatch. Key improvements compared to kpatch-build: - Integrated with objtool: Leverages objtool's existing control-flow graph analysis to help detect changed functions. - Works on vmlinux.o: Supports late-linked objects, making it compatible with LTO, IBT, and similar. - Simplified code base: ~3k fewer lines of code. - Upstream: No more out-of-tree #ifdef hacks, far less cruft. - Cleaner internals: Vastly simplified logic for symbol/section/reloc inclusion and special section extraction. - Robust __LINE__ macro handling: Avoids false positive binary diffs caused by the __LINE__ macro by introducing a fix-patch-lines script (coming in a later patch) which injects #line directives into the source .patch to preserve the original line numbers at compile time. Note the end result of this subcommand is not yet functionally complete. Livepatch needs some ELF magic which linkers don't like: - Two relocation sections (.rela*, .klp.rela*) for the same text section. - Use of SHN_LIVEPATCH to mark livepatch symbols. Unfortunately linkers tend to mangle such things. To work around that, klp diff generates a linker-compliant intermediate binary which encodes the relevant KLP section/reloc/symbol metadata. After module linking, a klp post-link step (coming soon) will clean up the mess and convert the linked .ko into a fully compliant livepatch module. Note this subcommand requires the diffed binaries to have been compiled with -ffunction-sections and -fdata-sections, and processed with 'objtool --checksum'. Those constraints will be handled by a klp-build script introduced in a later patch. Without '-ffunction-sections -fdata-sections', reliable object diffing would be infeasible due to toolchain limitations: - For intra-file+intra-section references, the compiler might occasionally generated hard-coded instruction offsets instead of relocations. - Section-symbol-based references can be ambiguous: - Overlapping or zero-length symbols create ambiguity as to which symbol is being referenced. - A reference to the end of a symbol (e.g., checking array bounds) can be misinterpreted as a reference to the next symbol, or vice versa. A potential future alternative to '-ffunction-sections -fdata-sections' would be to introduce a toolchain option that forces symbol-based (non-section) relocations. 
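For reference, the external structs introduced below could be populated by
the tooling roughly as follows; the function names are hypothetical and the
NULL-name-means-vmlinux convention is assumed to mirror struct klp_object:

  #include <linux/kernel.h>
  #include <linux/livepatch_external.h>

  struct seq_file;
  extern int new_meminfo_proc_show(struct seq_file *m, void *v);

  /* Hypothetical output of the diff step for one patched vmlinux function */
  static struct klp_func_ext example_funcs[] = {
          {
                  .old_name = "meminfo_proc_show",
                  .new_func = (void *)new_meminfo_proc_show,
                  .sympos   = 0,
          },
  };

  static struct klp_object_ext example_object = {
          .name     = NULL,       /* vmlinux, by assumption */
          .funcs    = example_funcs,
          .nr_funcs = ARRAY_SIZE(example_funcs),
  };

Such entries presumably land in the new __klp_funcs/__klp_objects sections
added to module.lds.S, and any symbol that has to be resolved against the
patched object at load time is renamed with the prefixes below, e.g.
".klp.sym.vmlinux.meminfo_proc_show,0".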
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- MAINTAINERS | 2 +- include/linux/livepatch.h | 25 +- include/linux/livepatch_external.h | 76 ++ kernel/livepatch/core.c | 4 +- scripts/module.lds.S | 10 +- tools/include/linux/livepatch_external.h | 76 ++ tools/include/linux/string.h | 14 + tools/objtool/Build | 4 +- tools/objtool/Makefile | 3 +- tools/objtool/arch/x86/decode.c | 40 + tools/objtool/builtin-klp.c | 52 + tools/objtool/check.c | 14 - tools/objtool/elf.c | 21 +- tools/objtool/include/objtool/arch.h | 1 + tools/objtool/include/objtool/builtin.h | 2 + tools/objtool/include/objtool/elf.h | 56 +- tools/objtool/include/objtool/klp.h | 31 + tools/objtool/include/objtool/objtool.h | 2 + tools/objtool/include/objtool/util.h | 19 + tools/objtool/klp-diff.c | 1646 ++++++++++++++++++++++++++++++ tools/objtool/objtool.c | 41 +- tools/objtool/sync-check.sh | 1 + tools/objtool/weak.c | 7 + 23 files changed, 2088 insertions(+), 59 deletions(-) create mode 100644 include/linux/livepatch_external.h create mode 100644 tools/include/linux/livepatch_external.h create mode 100644 tools/objtool/builtin-klp.c create mode 100644 tools/objtool/include/objtool/klp.h create mode 100644 tools/objtool/include/objtool/util.h create mode 100644 tools/objtool/klp-diff.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..755e2528f839 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14439,7 +14439,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: include/linux/livepatch.h +F: include/linux/livepatch*.h F: kernel/livepatch/ F: kernel/module/livepatch.c F: samples/livepatch/ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 51a258c24ff5..772919e8096a 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_LIVEPATCH) @@ -77,30 +78,6 @@ struct klp_func { bool transition; }; -struct klp_object; - -/** - * struct klp_callbacks - pre/post live-(un)patch callback structure - * @pre_patch: executed before code patching - * @post_patch: executed after code patching - * @pre_unpatch: executed before code unpatching - * @post_unpatch: executed after code unpatching - * @post_unpatch_enabled: flag indicating if post-unpatch callback - * should run - * - * All callbacks are optional. Only the pre-patch callback, if provided, - * will be unconditionally executed. If the parent klp_object fails to - * patch for any reason, including a non-zero error status returned from - * the pre-patch callback, no further callbacks will be executed. 
- */ -struct klp_callbacks { - int (*pre_patch)(struct klp_object *obj); - void (*post_patch)(struct klp_object *obj); - void (*pre_unpatch)(struct klp_object *obj); - void (*post_unpatch)(struct klp_object *obj); - bool post_unpatch_enabled; -}; - /** * struct klp_object - kernel object structure for live patching * @name: module name (or NULL for vmlinux) diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. 
+ */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 7e443c2cf7d4..0044a8125013 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -224,7 +224,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%511[^,],%lu", + KLP_SYM_PREFIX "%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -303,7 +303,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * See comment in klp_resolve_symbols() for an explanation * of the selected field width value. */ - cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]", sec_objname); if (cnt != 1) { pr_err("section %s has an incorrectly formatted name\n", diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 2632c6cb8ebe..3037d5e5527c 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -34,8 +34,16 @@ SECTIONS { __patchable_function_entries : { *(__patchable_function_entries) } + __klp_funcs 0: ALIGN(8) { KEEP(*(__klp_funcs)) } + + __klp_objects 0: ALIGN(8) { + __start_klp_objects = .; + KEEP(*(__klp_objects)) + __stop_klp_objects = .; + } + #ifdef CONFIG_ARCH_USES_CFI_TRAPS - __kcfi_traps : { KEEP(*(.kcfi_traps)) } + __kcfi_traps : { KEEP(*(.kcfi_traps)) } #endif .text : { diff --git a/tools/include/linux/livepatch_external.h b/tools/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/tools/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. 
If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index 8499f509f03e..51ad3cf4fa82 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -44,6 +44,20 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } +/* + * Checks if a string ends with another. + */ +static inline bool str_ends_with(const char *str, const char *substr) +{ + size_t len = strlen(str); + size_t sublen = strlen(substr); + + if (sublen > len) + return false; + + return !strcmp(str + len - sublen, substr); +} + extern char * __must_check skip_spaces(const char *); extern char *strim(char *); diff --git a/tools/objtool/Build b/tools/objtool/Build index a3cdf8af6635..0b01657671d7 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -8,8 +8,8 @@ objtool-y += builtin-check.o objtool-y += elf.o objtool-y += objtool.o -objtool-$(BUILD_ORC) += orc_gen.o -objtool-$(BUILD_ORC) += orc_dump.o +objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o +objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o objtool-y += libstring.o objtool-y += libctype.o diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 958761c05b7c..48928c9bebef 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -15,13 +15,14 @@ ifeq ($(ARCH_HAS_KLP),y) HAVE_XXHASH = $(shell echo "int main() {}" | \ $(HOSTCC) -xc - -o /dev/null -lxxhash 2> /dev/null && echo y || echo n) ifeq ($(HAVE_XXHASH),y) + BUILD_KLP := y LIBXXHASH_CFLAGS := $(shell $(HOSTPKG_CONFIG) libxxhash --cflags 2>/dev/null) \ -DBUILD_KLP LIBXXHASH_LIBS := $(shell $(HOSTPKG_CONFIG) libxxhash --libs 2>/dev/null || echo -lxxhash) endif endif -export BUILD_ORC +export BUILD_ORC BUILD_KLP ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index b2c320f701f9..5c72beeaa3a7 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -88,6 +88,46 @@ s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) return phys_to_virt(addend); } +static void scan_for_insn(struct section *sec, unsigned long offset, + unsigned long *insn_off, unsigned int *insn_len) +{ + unsigned long o = 0; + struct insn insn; + + while (1) { + + insn_decode(&insn, sec->data->d_buf + o, sec_size(sec) - o, + INSN_MODE_64); + + if (o + insn.length > offset) { + *insn_off = o; + 
*insn_len = insn.length; + return; + } + + o += insn.length; + } +} + +u64 arch_adjusted_addend(struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + s64 addend = reloc_addend(reloc); + unsigned long insn_off; + unsigned int insn_len; + + if (type == R_X86_64_PLT32) + return addend + 4; + + if (type != R_X86_64_PC32 || !is_text_sec(reloc->sec->base)) + return addend; + + scan_for_insn(reloc->sec->base, reloc_offset(reloc), + &insn_off, &insn_len); + + return addend + insn_off + insn_len - reloc_offset(reloc); +} + unsigned long arch_jump_destination(struct instruction *insn) { return insn->offset + insn->len + insn->immediate; diff --git a/tools/objtool/builtin-klp.c b/tools/objtool/builtin-klp.c new file mode 100644 index 000000000000..9b13dd1182af --- /dev/null +++ b/tools/objtool/builtin-klp.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include + +struct subcmd { + const char *name; + const char *description; + int (*fn)(int, const char **); +}; + +static struct subcmd subcmds[] = { + { "diff", "Generate binary diff of two object files", cmd_klp_diff, }, +}; + +static void cmd_klp_usage(void) +{ + fprintf(stderr, "usage: objtool klp []\n\n"); + fprintf(stderr, "Subcommands:\n"); + + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + fprintf(stderr, " %s\t%s\n", cmd->name, cmd->description); + } + + exit(1); +} + +int cmd_klp(int argc, const char **argv) +{ + argc--; + argv++; + + if (!argc) + cmd_klp_usage(); + + if (argc) { + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + if (!strcmp(cmd->name, argv[0])) + return cmd->fn(argc, argv); + } + } + + cmd_klp_usage(); + return 0; +} diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0f5278127f37..8d17d930d0c8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -185,20 +185,6 @@ static bool is_sibling_call(struct instruction *insn) return (is_static_jump(insn) && insn_call_dest(insn)); } -/* - * Checks if a string ends with another. - */ -static bool str_ends_with(const char *s, const char *sub) -{ - const int slen = strlen(s); - const int sublen = strlen(sub); - - if (sublen > slen) - return 0; - - return !memcmp(s + slen - sublen, sub, sublen); -} - /* * Checks if a function is a Rust "noreturn" one. 
*/ diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 0119b3b4c554..e1daae0630be 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -288,6 +288,18 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) return NULL; } +struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name) +{ + struct symbol *sym; + + elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) { + if (!strcmp(sym->name, name) && !is_local_sym(sym)) + return sym; + } + + return NULL; +} + struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { @@ -475,6 +487,8 @@ static int elf_add_symbol(struct elf *elf, struct symbol *sym) else entry = &sym->sec->symbol_list; list_add(&sym->list, entry); + + list_add_tail(&sym->global_list, &elf->symbols); elf_hash_add(symbol, &sym->hash, sym->idx); elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name)); @@ -531,6 +545,9 @@ static int read_symbols(struct elf *elf) ERROR_GLIBC("calloc"); return -1; } + + INIT_LIST_HEAD(&elf->symbols); + for (i = 0; i < symbols_nr; i++) { sym = &elf->symbol_data[i]; @@ -639,7 +656,7 @@ static int mark_group_syms(struct elf *elf) return -1; } - list_for_each_entry(sec, &elf->sections, list) { + for_each_sec(elf, sec) { if (sec->sh.sh_type == SHT_GROUP && sec->sh.sh_link == symtab->idx) { sym = find_symbol_by_index(elf, sec->sh.sh_info); @@ -1224,6 +1241,8 @@ struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name) return NULL; } + INIT_LIST_HEAD(&elf->symbols); + if (!elf_alloc_hash(section, 1000) || !elf_alloc_hash(section_name, 1000) || !elf_alloc_hash(symbol, 10000) || diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index a4502947307a..d89f8b5ec14e 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -84,6 +84,7 @@ bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc); +u64 arch_adjusted_addend(struct reloc *reloc); const char *arch_nop_insn(int len); const char *arch_ret_insn(int len); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index cee9fc031877..bb0b25eb08ba 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -53,4 +53,6 @@ int objtool_run(int argc, const char **argv); int make_backup(void); +int cmd_klp(int argc, const char **argv); + #endif /* _BUILTIN_H */ diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index a1f1762f89c4..e2cd817fca52 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -18,6 +18,7 @@ #include #include +#define SEC_NAME_LEN 1024 #define SYM_NAME_LEN 512 #define bswap_if_needed(elf, val) __bswap_if_needed(&elf->ehdr, val) @@ -53,10 +54,12 @@ struct section { bool _changed, text, rodata, noinstr, init, truncate; struct reloc *relocs; unsigned long nr_alloc_relocs; + struct section *twin; }; struct symbol { struct list_head list; + struct list_head global_list; struct rb_node node; struct elf_hash_node hash; struct elf_hash_node name_hash; @@ -83,10 +86,13 @@ struct symbol { u8 cold : 1; u8 prefix : 1; u8 debug_checksum : 1; + u8 changed : 1; + u8 included : 1; struct list_head pv_target; struct reloc *relocs; struct section *group_sec; struct checksum csum; + struct symbol 
*twin, *clone; }; struct reloc { @@ -104,6 +110,7 @@ struct elf { const char *name, *tmp_name; unsigned int num_files; struct list_head sections; + struct list_head symbols; unsigned long num_relocs; int symbol_bits; @@ -179,6 +186,7 @@ struct section *find_section_by_name(const struct elf *elf, const char *name); struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); +struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name); struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); int find_symbol_hole_containing(const struct section *sec, unsigned long offset); struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); @@ -448,22 +456,48 @@ static inline void set_sym_next_reloc(struct reloc *reloc, struct reloc *next) #define sec_for_each_sym(sec, sym) \ list_for_each_entry(sym, &sec->symbol_list, list) +#define sec_prev_sym(sym) \ + sym->sec && sym->list.prev != &sym->sec->symbol_list ? \ + list_prev_entry(sym, list) : NULL + #define for_each_sym(elf, sym) \ - for (struct section *__sec, *__fake = (struct section *)1; \ - __fake; __fake = NULL) \ - for_each_sec(elf, __sec) \ - sec_for_each_sym(__sec, sym) + list_for_each_entry(sym, &elf->symbols, global_list) + +#define for_each_sym_continue(elf, sym) \ + list_for_each_entry_continue(sym, &elf->symbols, global_list) + +#define rsec_next_reloc(rsec, reloc) \ + reloc_idx(reloc) < sec_num_entries(rsec) - 1 ? reloc + 1 : NULL #define for_each_reloc(rsec, reloc) \ - for (int __i = 0, __fake = 1; __fake; __fake = 0) \ - for (reloc = rsec->relocs; \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (reloc = rsec->relocs; reloc; reloc = rsec_next_reloc(rsec, reloc)) #define for_each_reloc_from(rsec, reloc) \ - for (int __i = reloc_idx(reloc); \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (; reloc; reloc = rsec_next_reloc(rsec, reloc)) + +#define for_each_reloc_continue(rsec, reloc) \ + for (reloc = rsec_next_reloc(rsec, reloc); reloc; \ + reloc = rsec_next_reloc(rsec, reloc)) + +#define sym_for_each_reloc(elf, sym, reloc) \ + for (reloc = find_reloc_by_dest_range(elf, sym->sec, \ + sym->offset, sym->len); \ + reloc && reloc_offset(reloc) < sym->offset + sym->len; \ + reloc = rsec_next_reloc(sym->sec->rsec, reloc)) + +static inline struct symbol *get_func_prefix(struct symbol *func) +{ + struct symbol *prev; + + if (!is_func_sym(func)) + return NULL; + + prev = sec_prev_sym(func); + if (prev && is_prefix_func(prev)) + return prev; + + return NULL; +} #define OFFSET_STRIDE_BITS 4 #define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS) diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h new file mode 100644 index 000000000000..07928fac059b --- /dev/null +++ b/tools/objtool/include/objtool/klp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_KLP_H +#define _OBJTOOL_KLP_H + +/* + * __klp_objects and __klp_funcs are created by klp diff and used by the patch + * module init code to build the klp_patch, klp_object and klp_func structs + * needed by the livepatch API. + */ +#define KLP_OBJECTS_SEC "__klp_objects" +#define KLP_FUNCS_SEC "__klp_funcs" + +/* + * __klp_relocs is an intermediate section which are created by klp diff and + * converted into KLP symbols/relas by "objtool klp post-link". 
This is needed + * to work around the linker, which doesn't preserve SHN_LIVEPATCH or + * SHF_RELA_LIVEPATCH, nor does it support having two RELA sections for a + * single PROGBITS section. + */ +#define KLP_RELOCS_SEC "__klp_relocs" +#define KLP_STRINGS_SEC ".rodata.klp.str1.1" + +struct klp_reloc { + void *offset; + void *sym; + u32 type; +}; + +int cmd_klp_diff(int argc, const char **argv); + +#endif /* _OBJTOOL_KLP_H */ diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index c0dc86a78ff6..7f70b41d1b8d 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -39,6 +39,8 @@ struct objtool_file { struct pv_state *pv_ops; }; +char *top_level_dir(const char *file); + struct objtool_file *objtool_open_read(const char *_objname); int objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func); diff --git a/tools/objtool/include/objtool/util.h b/tools/objtool/include/objtool/util.h new file mode 100644 index 000000000000..a0180b312f73 --- /dev/null +++ b/tools/objtool/include/objtool/util.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _UTIL_H +#define _UTIL_H + +#include + +#define snprintf_check(str, size, format, args...) \ +({ \ + int __ret = snprintf(str, size, format, args); \ + if (__ret < 0) \ + ERROR_GLIBC("snprintf"); \ + else if (__ret >= size) \ + ERROR("snprintf() failed for '" format "'", args); \ + else \ + __ret = 0; \ + __ret; \ +}) + +#endif /* _UTIL_H */ diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c new file mode 100644 index 000000000000..0d69b621a26c --- /dev/null +++ b/tools/objtool/klp-diff.c @@ -0,0 +1,1646 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#define _GNU_SOURCE /* memmem() */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) + +struct elfs { + struct elf *orig, *patched, *out; + const char *modname; +}; + +struct export { + struct hlist_node hash; + char *mod, *sym; +}; + +static const char * const klp_diff_usage[] = { + "objtool klp diff [] ", + NULL, +}; + +static const struct option klp_diff_options[] = { + OPT_END(), +}; + +static DEFINE_HASHTABLE(exports, 15); + +static inline u32 str_hash(const char *str) +{ + return jhash(str, strlen(str), 0); +} + +static int read_exports(void) +{ + const char *symvers = "Module.symvers"; + char line[1024], *path = NULL; + unsigned int line_num = 1; + FILE *file; + + file = fopen(symvers, "r"); + if (!file) { + path = top_level_dir(symvers); + if (!path) { + ERROR("can't open '%s', \"objtool diff\" should be run from the kernel tree", symvers); + return -1; + } + + file = fopen(path, "r"); + if (!file) { + ERROR_GLIBC("fopen"); + return -1; + } + } + + while (fgets(line, 1024, file)) { + char *sym, *mod, *type; + struct export *export; + + sym = strchr(line, '\t'); + if (!sym) { + ERROR("malformed Module.symvers (sym) at line %d", line_num); + return -1; + } + + *sym++ = '\0'; + + mod = strchr(sym, '\t'); + if (!mod) { + ERROR("malformed Module.symvers (mod) at line %d", line_num); + return -1; + } + + *mod++ = '\0'; + + type = strchr(mod, '\t'); + if (!type) { + ERROR("malformed Module.symvers (type) at line %d", line_num); + return -1; + } + + *type++ = '\0'; + + if (*sym == '\0' || *mod == '\0') { + ERROR("malformed Module.symvers at line %d", line_num); + return -1; + } + + 
export = calloc(1, sizeof(*export)); + if (!export) { + ERROR_GLIBC("calloc"); + return -1; + } + + export->mod = strdup(mod); + if (!export->mod) { + ERROR_GLIBC("strdup"); + return -1; + } + + export->sym = strdup(sym); + if (!export->sym) { + ERROR_GLIBC("strdup"); + return -1; + } + + hash_add(exports, &export->hash, str_hash(sym)); + } + + free(path); + fclose(file); + + return 0; +} + +static int read_sym_checksums(struct elf *elf) +{ + struct section *sec; + + sec = find_section_by_name(elf, ".discard.sym_checksum"); + if (!sec) { + ERROR("'%s' missing .discard.sym_checksum section, file not processed by 'objtool --checksum'?", + elf->name); + return -1; + } + + if (!sec->rsec) { + ERROR("missing reloc section for .discard.sym_checksum"); + return -1; + } + + if (sec_size(sec) % sizeof(struct sym_checksum)) { + ERROR("struct sym_checksum size mismatch"); + return -1; + } + + for (int i = 0; i < sec_size(sec) / sizeof(struct sym_checksum); i++) { + struct sym_checksum *sym_checksum; + struct reloc *reloc; + struct symbol *sym; + + sym_checksum = (struct sym_checksum *)sec->data->d_buf + i; + + reloc = find_reloc_by_dest(elf, sec, i * sizeof(*sym_checksum)); + if (!reloc) { + ERROR("can't find reloc for sym_checksum[%d]", i); + return -1; + } + + sym = reloc->sym; + + if (is_sec_sym(sym)) { + ERROR("not sure how to handle section %s", sym->name); + return -1; + } + + if (is_func_sym(sym)) + sym->csum.checksum = sym_checksum->checksum; + } + + return 0; +} + +static struct symbol *first_file_symbol(struct elf *elf) +{ + struct symbol *sym; + + for_each_sym(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +static struct symbol *next_file_symbol(struct elf *elf, struct symbol *sym) +{ + for_each_sym_continue(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +/* + * Certain static local variables should never be correlated. They will be + * used in place rather than referencing the originals. + */ +static bool is_uncorrelated_static_local(struct symbol *sym) +{ + static const char * const vars[] = { + "__key.", + "__warned.", + "__already_done.", + "__func__.", + "_rs.", + "descriptor.", + "CSWTCH.", + }; + + if (!is_object_sym(sym) || !is_local_sym(sym)) + return false; + + if (!strcmp(sym->sec->name, ".data.once")) + return true; + + for (int i = 0; i < ARRAY_SIZE(vars); i++) { + if (strstarts(sym->name, vars[i])) + return true; + } + + return false; +} + +/* + * Clang emits several useless .Ltmp_* code labels. + */ +static bool is_clang_tmp_label(struct symbol *sym) +{ + return sym->type == STT_NOTYPE && + is_text_sec(sym->sec) && + strstarts(sym->name, ".Ltmp") && + isdigit(sym->name[5]); +} + +static bool is_special_section(struct section *sec) +{ + static const char * const specials[] = { + ".altinstructions", + ".smp_locks", + "__bug_table", + "__ex_table", + "__jump_table", + "__mcount_loc", + + /* + * Extract .static_call_sites here to inherit non-module + * preferential treatment. The later static call processing + * during klp module build will be skipped when it sees this + * section already exists. 
+ */ + ".static_call_sites", + }; + + static const char * const non_special_discards[] = { + ".discard.addressable", + ".discard.sym_checksum", + }; + + if (is_text_sec(sec)) + return false; + + for (int i = 0; i < ARRAY_SIZE(specials); i++) { + if (!strcmp(sec->name, specials[i])) + return true; + } + + /* Most .discard data sections are special */ + for (int i = 0; i < ARRAY_SIZE(non_special_discards); i++) { + if (!strcmp(sec->name, non_special_discards[i])) + return false; + } + + return strstarts(sec->name, ".discard."); +} + +/* + * These sections are referenced by special sections but aren't considered + * special sections themselves. + */ +static bool is_special_section_aux(struct section *sec) +{ + static const char * const specials_aux[] = { + ".altinstr_replacement", + ".altinstr_aux", + }; + + for (int i = 0; i < ARRAY_SIZE(specials_aux); i++) { + if (!strcmp(sec->name, specials_aux[i])) + return true; + } + + return false; +} + +/* + * These symbols should never be correlated, so their local patched versions + * are used instead of linking to the originals. + */ +static bool dont_correlate(struct symbol *sym) +{ + return is_file_sym(sym) || + is_null_sym(sym) || + is_sec_sym(sym) || + is_prefix_func(sym) || + is_uncorrelated_static_local(sym) || + is_clang_tmp_label(sym) || + is_string_sec(sym->sec) || + is_special_section(sym->sec) || + is_special_section_aux(sym->sec) || + strstarts(sym->name, "__initcall__"); +} + +/* + * For each symbol in the original kernel, find its corresponding "twin" in the + * patched kernel. + */ +static int correlate_symbols(struct elfs *e) +{ + struct symbol *file1_sym, *file2_sym; + struct symbol *sym1, *sym2; + + /* Correlate locals */ + for (file1_sym = first_file_symbol(e->orig), + file2_sym = first_file_symbol(e->patched); ; + file1_sym = next_file_symbol(e->orig, file1_sym), + file2_sym = next_file_symbol(e->patched, file2_sym)) { + + if (!file1_sym && file2_sym) { + ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name); + return -1; + } + + if (file1_sym && !file2_sym) { + ERROR("FILE symbol mismatch: %s != NULL", file1_sym->name); + return -1; + } + + if (!file1_sym) + break; + + if (strcmp(file1_sym->name, file2_sym->name)) { + ERROR("FILE symbol mismatch: %s != %s", file1_sym->name, file2_sym->name); + return -1; + } + + file1_sym->twin = file2_sym; + file2_sym->twin = file1_sym; + + sym1 = file1_sym; + + for_each_sym_continue(e->orig, sym1) { + if (is_file_sym(sym1) || !is_local_sym(sym1)) + break; + + if (dont_correlate(sym1)) + continue; + + sym2 = file2_sym; + for_each_sym_continue(e->patched, sym2) { + if (is_file_sym(sym2) || !is_local_sym(sym2)) + break; + + if (sym2->twin || dont_correlate(sym2)) + continue; + + if (strcmp(sym1->demangled_name, sym2->demangled_name)) + continue; + + sym1->twin = sym2; + sym2->twin = sym1; + break; + } + } + } + + /* Correlate globals */ + for_each_sym(e->orig, sym1) { + if (sym1->bind == STB_LOCAL) + continue; + + sym2 = find_global_symbol_by_name(e->patched, sym1->name); + + if (sym2 && !sym2->twin && !strcmp(sym1->name, sym2->name)) { + sym1->twin = sym2; + sym2->twin = sym1; + } + } + + for_each_sym(e->orig, sym1) { + if (sym1->twin || dont_correlate(sym1)) + continue; + WARN("no correlation: %s", sym1->name); + } + + return 0; +} + +/* "sympos" is used by livepatch to disambiguate duplicate symbol names */ +static unsigned long find_sympos(struct elf *elf, struct symbol *sym) +{ + bool vmlinux = str_ends_with(objname, "vmlinux.o"); + unsigned long sympos = 0, nr_matches = 0; + bool 
has_dup = false; + struct symbol *s; + + if (sym->bind != STB_LOCAL) + return 0; + + if (vmlinux && sym->type == STT_FUNC) { + /* + * HACK: Unfortunately, symbol ordering can differ between + * vmlinux.o and vmlinux due to the linker script emitting + * .text.unlikely* before .text*. Count .text.unlikely* first. + * + * TODO: Disambiguate symbols more reliably (checksums?) + */ + for_each_sym(elf, s) { + if (strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + for_each_sym(elf, s) { + if (!strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } else { + for_each_sym(elf, s) { + if (!strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } + + if (!sympos) { + ERROR("can't find sympos for %s", sym->name); + return ULONG_MAX; + } + + return has_dup ? sympos : 0; +} + +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym); + +static struct symbol *__clone_symbol(struct elf *elf, struct symbol *patched_sym, + bool data_too) +{ + struct section *out_sec = NULL; + unsigned long offset = 0; + struct symbol *out_sym; + + if (data_too && !is_undef_sym(patched_sym)) { + struct section *patched_sec = patched_sym->sec; + + out_sec = find_section_by_name(elf, patched_sec->name); + if (!out_sec) { + out_sec = elf_create_section(elf, patched_sec->name, 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return NULL; + } + + if (is_string_sec(patched_sym->sec)) { + out_sym = elf_create_section_symbol(elf, out_sec); + if (!out_sym) + return NULL; + + goto sym_created; + } + + if (!is_sec_sym(patched_sym)) + offset = sec_size(out_sec); + + if (patched_sym->len || is_sec_sym(patched_sym)) { + void *data = NULL; + size_t size; + + /* bss doesn't have data */ + if (patched_sym->sec->data->d_buf) + data = patched_sym->sec->data->d_buf + patched_sym->offset; + + if (is_sec_sym(patched_sym)) + size = sec_size(patched_sym->sec); + else + size = patched_sym->len; + + if (!elf_add_data(elf, out_sec, data, size)) + return NULL; + } + } + + out_sym = elf_create_symbol(elf, patched_sym->name, out_sec, + patched_sym->bind, patched_sym->type, + offset, patched_sym->len); + if (!out_sym) + return NULL; + +sym_created: + patched_sym->clone = out_sym; + out_sym->clone = patched_sym; + + return out_sym; +} + +/* + * Copy a symbol to the output object, optionally including its data and + * relocations. 
+ */ +static struct symbol *clone_symbol(struct elfs *e, struct symbol *patched_sym, + bool data_too) +{ + struct symbol *pfx; + + if (patched_sym->clone) + return patched_sym->clone; + + /* Make sure the prefix gets cloned first */ + if (is_func_sym(patched_sym) && data_too) { + pfx = get_func_prefix(patched_sym); + if (pfx) + clone_symbol(e, pfx, true); + } + + if (!__clone_symbol(e->out, patched_sym, data_too)) + return NULL; + + if (data_too && clone_sym_relocs(e, patched_sym)) + return NULL; + + return patched_sym->clone; +} + +static void mark_included_function(struct symbol *func) +{ + struct symbol *pfx; + + func->included = 1; + + /* Include prefix function */ + pfx = get_func_prefix(func); + if (pfx) + pfx->included = 1; + + /* Make sure .cold parent+child always stay together */ + if (func->cfunc && func->cfunc != func) + func->cfunc->included = 1; + if (func->pfunc && func->pfunc != func) + func->pfunc->included = 1; +} + +/* + * Copy all changed functions (and their dependencies) from the patched object + * to the output object. + */ +static int mark_changed_functions(struct elfs *e) +{ + struct symbol *sym_orig, *patched_sym; + bool changed = false; + + /* Find changed functions */ + for_each_sym(e->orig, sym_orig) { + if (!is_func_sym(sym_orig) || is_prefix_func(sym_orig)) + continue; + + patched_sym = sym_orig->twin; + if (!patched_sym) + continue; + + if (sym_orig->csum.checksum != patched_sym->csum.checksum) { + patched_sym->changed = 1; + mark_included_function(patched_sym); + changed = true; + } + } + + /* Find added functions and print them */ + for_each_sym(e->patched, patched_sym) { + if (!is_func_sym(patched_sym) || is_prefix_func(patched_sym)) + continue; + + if (!patched_sym->twin) { + printf("%s: new function: %s\n", objname, patched_sym->name); + mark_included_function(patched_sym); + changed = true; + } + } + + /* Print changed functions */ + for_each_sym(e->patched, patched_sym) { + if (patched_sym->changed) + printf("%s: changed function: %s\n", objname, patched_sym->name); + } + + return !changed ? -1 : 0; +} + +static int clone_included_functions(struct elfs *e) +{ + struct symbol *patched_sym; + + for_each_sym(e->patched, patched_sym) { + if (patched_sym->included) { + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + } + + return 0; +} + +/* + * Determine whether a relocation should reference the section rather than the + * underlying symbol. + */ +static bool section_reference_needed(struct section *sec) +{ + /* + * String symbols are zero-length and uncorrelated. It's easier to + * deal with them as section symbols. + */ + if (is_string_sec(sec)) + return true; + + /* + * .rodata has mostly anonymous data so there's no way to determine the + * length of a needed reference. just copy the whole section if needed. 
+ */ + if (strstarts(sec->name, ".rodata")) + return true; + + /* UBSAN anonymous data */ + if (strstarts(sec->name, ".data..Lubsan") || /* GCC */ + strstarts(sec->name, ".data..L__unnamed_")) /* Clang */ + return true; + + return false; +} + +static bool is_reloc_allowed(struct reloc *reloc) +{ + return section_reference_needed(reloc->sym->sec) == is_sec_sym(reloc->sym); +} + +static struct export *find_export(struct symbol *sym) +{ + struct export *export; + + hash_for_each_possible(exports, export, hash, str_hash(sym->name)) { + if (!strcmp(export->sym, sym->name)) + return export; + } + + return NULL; +} + +static const char *__find_modname(struct elfs *e) +{ + struct section *sec; + char *name; + + sec = find_section_by_name(e->orig, ".modinfo"); + if (!sec) { + ERROR("missing .modinfo section"); + return NULL; + } + + name = memmem(sec->data->d_buf, sec_size(sec), "\0name=", 6); + if (name) + return name + 6; + + name = strdup(e->orig->name); + if (!name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + for (char *c = name; *c; c++) { + if (*c == '/') + name = c + 1; + else if (*c == '-') + *c = '_'; + else if (*c == '.') { + *c = '\0'; + break; + } + } + + return name; +} + +/* Get the object's module name as defined by the kernel (and klp_object) */ +static const char *find_modname(struct elfs *e) +{ + const char *modname; + + if (e->modname) + return e->modname; + + modname = __find_modname(e); + e->modname = modname; + return modname; +} + +/* + * Copying a function from its native compiled environment to a kernel module + * removes its natural access to local functions/variables and unexported + * globals. References to such symbols need to be converted to KLP relocs so + * the kernel arch relocation code knows to apply them and where to find the + * symbols. Particularly, duplicate static symbols need to be disambiguated. + */ +static bool klp_reloc_needed(struct reloc *patched_reloc) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export; + + /* no external symbol to reference */ + if (dont_correlate(patched_sym)) + return false; + + /* For included functions, a regular reloc will do. */ + if (patched_sym->included) + return false; + + /* + * If exported by a module, it has to be a klp reloc. Thanks to the + * clusterfunk that is late module patching, the patch module is + * allowed to be loaded before any modules it depends on. + * + * If exported by vmlinux, a normal reloc will do. + */ + export = find_export(patched_sym); + if (export) + return strcmp(export->mod, "vmlinux"); + + if (!patched_sym->twin) { + /* + * Presumably the symbol and its reference were added by the + * patch. The symbol could be defined in this .o or in another + * .o in the patch module. + * + * This check needs to be *after* the export check due to the + * possibility of the patch adding a new UNDEF reference to an + * exported symbol. + */ + return false; + } + + /* Unexported symbol which lives in the original vmlinux or module. 
*/ + return true; +} + +static int convert_reloc_sym_to_secsym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + if (!sec->sym && !elf_create_section_symbol(elf, sec)) + return -1; + + reloc->sym = sec->sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, sym->offset + reloc_addend(reloc)); + return 0; +} + +static int convert_reloc_secsym_to_sym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + /* If the symbol has a dedicated section, it's easy to find */ + sym = find_symbol_by_offset(sec, 0); + if (sym && sym->len == sec_size(sec)) + goto found_sym; + + /* No dedicated section; find the symbol manually */ + sym = find_symbol_containing(sec, arch_adjusted_addend(reloc)); + if (!sym) { + /* + * This can happen for special section references to weak code + * whose symbol has been stripped by the linker. + */ + return -1; + } + +found_sym: + reloc->sym = sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, reloc_addend(reloc) - sym->offset); + return 0; +} + +/* + * Convert a relocation symbol reference to the needed format: either a section + * symbol or the underlying symbol itself. + */ +static int convert_reloc_sym(struct elf *elf, struct reloc *reloc) +{ + if (is_reloc_allowed(reloc)) + return 0; + + if (section_reference_needed(reloc->sym->sec)) + return convert_reloc_sym_to_secsym(elf, reloc); + else + return convert_reloc_secsym_to_sym(elf, reloc); +} + +/* + * Convert a regular relocation to a klp relocation (sort of). + */ +static int clone_reloc_klp(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset, + struct export *export) +{ + struct symbol *patched_sym = patched_reloc->sym; + s64 addend = reloc_addend(patched_reloc); + const char *sym_modname, *sym_orig_name; + static struct section *klp_relocs; + struct symbol *sym, *klp_sym; + unsigned long klp_reloc_off; + char sym_name[SYM_NAME_LEN]; + struct klp_reloc klp_reloc; + unsigned long sympos; + + if (!patched_sym->twin) { + ERROR("unexpected klp reloc for new symbol %s", patched_sym->name); + return -1; + } + + /* + * Keep the original reloc intact for now to avoid breaking objtool run + * which relies on proper relocations for many of its features. This + * will be disabled later by "objtool klp post-link". + * + * Convert it to UNDEF (and WEAK to avoid modpost warnings). + */ + + sym = patched_sym->clone; + if (!sym) { + /* STB_WEAK: avoid modpost undefined symbol warnings */ + sym = elf_create_symbol(e->out, patched_sym->name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!sym) + return -1; + + patched_sym->clone = sym; + sym->clone = patched_sym; + } + + if (!elf_create_reloc(e->out, sec, offset, sym, addend, reloc_type(patched_reloc))) + return -1; + + /* + * Create the KLP symbol. 
+ */ + + if (export) { + sym_modname = export->mod; + sym_orig_name = export->sym; + sympos = 0; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + + sym_orig_name = patched_sym->twin->name; + sympos = find_sympos(e->orig, patched_sym->twin); + if (sympos == ULONG_MAX) + return -1; + } + + /* symbol format: .klp.sym.modname.sym_name,sympos */ + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_SYM_PREFIX "%s.%s,%ld", + sym_modname, sym_orig_name, sympos)) + return -1; + + klp_sym = find_symbol_by_name(e->out, sym_name); + if (!klp_sym) { + /* STB_WEAK: avoid modpost undefined symbol warnings */ + klp_sym = elf_create_symbol(e->out, sym_name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!klp_sym) + return -1; + } + + /* + * Create the __klp_relocs entry. This will be converted to an actual + * KLP rela by "objtool klp post-link". + * + * This intermediate step is necessary to prevent corruption by the + * linker, which doesn't know how to properly handle two rela sections + * applying to the same base section. + */ + + if (!klp_relocs) { + klp_relocs = elf_create_section(e->out, KLP_RELOCS_SEC, 0, + 0, SHT_PROGBITS, 8, SHF_ALLOC); + if (!klp_relocs) + return -1; + } + + klp_reloc_off = sec_size(klp_relocs); + memset(&klp_reloc, 0, sizeof(klp_reloc)); + + klp_reloc.type = reloc_type(patched_reloc); + if (!elf_add_data(e->out, klp_relocs, &klp_reloc, sizeof(klp_reloc))) + return -1; + + /* klp_reloc.offset */ + if (!sec->sym && !elf_create_section_symbol(e->out, sec)) + return -1; + + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, offset), + sec->sym, offset, R_ABS64)) + return -1; + + /* klp_reloc.sym */ + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, sym), + klp_sym, addend, R_ABS64)) + return -1; + + return 0; +} + +/* Copy a reloc and its symbol to the output object */ +static int clone_reloc(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export = find_export(patched_sym); + long addend = reloc_addend(patched_reloc); + struct symbol *out_sym; + bool klp; + + if (!is_reloc_allowed(patched_reloc)) { + ERROR_FUNC(patched_reloc->sec->base, reloc_offset(patched_reloc), + "missing symbol for reference to %s+%ld", + patched_sym->name, addend); + return -1; + } + + klp = klp_reloc_needed(patched_reloc); + + if (klp) { + if (clone_reloc_klp(e, patched_reloc, sec, offset, export)) + return -1; + + return 0; + } + + /* + * Why !export sets 'data_too': + * + * Unexported non-klp symbols need to live in the patch module, + * otherwise there will be unresolved symbols. Notably, this includes: + * + * - New functions/data + * - String sections + * - Special section entries + * - Uncorrelated static local variables + * - UBSAN sections + */ + out_sym = clone_symbol(e, patched_sym, patched_sym->included || !export); + if (!out_sym) + return -1; + + /* + * For strings, all references use section symbols, thanks to + * section_reference_needed(). clone_symbol() has cloned an empty + * version of the string section. Now copy the string itself. 
+ */ + if (is_string_sec(patched_sym->sec)) { + const char *str = patched_sym->sec->data->d_buf + addend; + + addend = elf_add_string(e->out, out_sym->sec, str); + if (addend == -1) + return -1; + } + + if (!elf_create_reloc(e->out, sec, offset, out_sym, addend, + reloc_type(patched_reloc))) + return -1; + + return 0; +} + +/* Copy all relocs needed for a symbol's contents */ +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym) +{ + struct section *patched_rsec = patched_sym->sec->rsec; + struct reloc *patched_reloc; + unsigned long start, end; + struct symbol *out_sym; + + out_sym = patched_sym->clone; + if (!out_sym) { + ERROR("no clone for %s", patched_sym->name); + return -1; + } + + if (!patched_rsec) + return 0; + + if (!is_sec_sym(patched_sym) && !patched_sym->len) + return 0; + + if (is_string_sec(patched_sym->sec)) + return 0; + + if (is_sec_sym(patched_sym)) { + start = 0; + end = sec_size(patched_sym->sec); + } else { + start = patched_sym->offset; + end = start + patched_sym->len; + } + + for_each_reloc(patched_rsec, patched_reloc) { + unsigned long offset; + + if (reloc_offset(patched_reloc) < start || + reloc_offset(patched_reloc) >= end) + continue; + + /* + * Skip any reloc referencing .altinstr_aux. Its code is + * always patched by alternatives. See ALTERNATIVE_TERNARY(). + */ + if (patched_reloc->sym->sec && + !strcmp(patched_reloc->sym->sec->name, ".altinstr_aux")) + continue; + + if (convert_reloc_sym(e->patched, patched_reloc)) { + ERROR_FUNC(patched_rsec->base, reloc_offset(patched_reloc), + "failed to convert reloc sym '%s' to its proper format", + patched_reloc->sym->name); + return -1; + } + + offset = out_sym->offset + (reloc_offset(patched_reloc) - patched_sym->offset); + + if (clone_reloc(e, patched_reloc, out_sym->sec, offset)) + return -1; + } + return 0; + +} + +static int create_fake_symbol(struct elf *elf, struct section *sec, + unsigned long offset, size_t size) +{ + char name[SYM_NAME_LEN]; + unsigned int type; + static int ctr; + char *c; + + if (snprintf_check(name, SYM_NAME_LEN, "%s_%d", sec->name, ctr++)) + return -1; + + for (c = name; *c; c++) + if (*c == '.') + *c = '_'; + + /* + * STT_NOTYPE: Prevent objtool from validating .altinstr_replacement + * while still allowing objdump to disassemble it. + */ + type = is_text_sec(sec) ? STT_NOTYPE : STT_OBJECT; + return elf_create_symbol(elf, name, sec, STB_LOCAL, type, offset, size) ? 0 : -1; +} + +/* + * Special sections (alternatives, etc) are basically arrays of structs. + * For all the special sections, create a symbol for each struct entry. This + * is a bit cumbersome, but it makes the extracting of the individual entries + * much more straightforward. + * + * There are three ways to identify the entry sizes for a special section: + * + * 1) ELF section header sh_entsize: Ideally this would be used almost + * everywhere. But unfortunately the toolchains make it difficult. The + * assembler .[push]section directive syntax only takes entsize when + * combined with SHF_MERGE. But Clang disallows combining SHF_MERGE with + * SHF_WRITE. And some special sections do need to be writable. + * + * Another place this wouldn't work is .altinstr_replacement, whose entries + * don't have a fixed size. + * + * 2) ANNOTATE_DATA_SPECIAL: This is a lightweight objtool annotation which + * points to the beginning of each entry. The size of the entry is then + * inferred by the location of the subsequent annotation (or end of + * section). 
+ * + * 3) Simple array of pointers: If the special section is just a basic array of + * pointers, the entry size can be inferred by the number of relocations. + * No annotations needed. + * + * Note I also tried to create per-entry symbols at the time of creation, in + * the original [inline] asm. Unfortunately, creating uniquely named symbols + * is trickier than one might think, especially with Clang inline asm. I + * eventually just gave up trying to make that work, in favor of using + * ANNOTATE_DATA_SPECIAL and creating the symbols here after the fact. + */ +static int create_fake_symbols(struct elf *elf) +{ + struct section *sec; + struct reloc *reloc; + + /* + * 1) Make symbols for all the ANNOTATE_DATA_SPECIAL entries: + */ + + sec = find_section_by_name(elf, ".discard.annotate_data"); + if (!sec || !sec->rsec) + return 0; + + for_each_reloc(sec->rsec, reloc) { + unsigned long offset, size; + struct reloc *next_reloc; + + if (annotype(elf, sec, reloc) != ANNOTYPE_DATA_SPECIAL) + continue; + + offset = reloc_addend(reloc); + + size = 0; + next_reloc = reloc; + for_each_reloc_continue(sec->rsec, next_reloc) { + if (annotype(elf, sec, next_reloc) != ANNOTYPE_DATA_SPECIAL || + next_reloc->sym->sec != reloc->sym->sec) + continue; + + size = reloc_addend(next_reloc) - offset; + break; + } + + if (!size) + size = sec_size(reloc->sym->sec) - offset; + + if (create_fake_symbol(elf, reloc->sym->sec, offset, size)) + return -1; + } + + /* + * 2) Make symbols for sh_entsize, and simple arrays of pointers: + */ + + for_each_sec(elf, sec) { + unsigned int entry_size; + unsigned long offset; + + if (!is_special_section(sec) || find_symbol_by_offset(sec, 0)) + continue; + + if (!sec->rsec) { + ERROR("%s: missing special section relocations", sec->name); + return -1; + } + + entry_size = sec->sh.sh_entsize; + if (!entry_size) { + entry_size = arch_reloc_size(sec->rsec->relocs); + if (sec_size(sec) != entry_size * sec_num_entries(sec->rsec)) { + ERROR("%s: missing special section entsize or annotations", sec->name); + return -1; + } + } + + for (offset = 0; offset < sec_size(sec); offset += entry_size) { + if (create_fake_symbol(elf, sec, offset, entry_size)) + return -1; + } + } + + return 0; +} + +/* Keep a special section entry if it references an included function */ +static bool should_keep_special_sym(struct elf *elf, struct symbol *sym) +{ + struct reloc *reloc; + + if (is_sec_sym(sym) || !sym->sec->rsec) + return false; + + sym_for_each_reloc(elf, sym, reloc) { + if (convert_reloc_sym(elf, reloc)) + continue; + + if (is_func_sym(reloc->sym) && reloc->sym->included) + return true; + } + + return false; +} + +/* + * Klp relocations aren't allowed for __jump_table and .static_call_sites if + * the referenced symbol lives in a kernel module, because such klp relocs may + * be applied after static branch/call init, resulting in code corruption. + * + * Validate a special section entry to avoid that. Note that an inert + * tracepoint is harmless enough, in that case just skip the entry and print a + * warning. Otherwise, return an error. + * + * This is only a temporary limitation which will be fixed when livepatch adds + * support for submodules: fully self-contained modules which are embedded in + * the top-level livepatch module's data and which can be loaded on demand when + * their corresponding to-be-patched module gets loaded. Then klp relocs can + * be retired. 
+ * + * Return: + * -1: error: validation failed + * 1: warning: tracepoint skipped + * 0: success + */ +static int validate_special_section_klp_reloc(struct elfs *e, struct symbol *sym) +{ + bool static_branch = !strcmp(sym->sec->name, "__jump_table"); + bool static_call = !strcmp(sym->sec->name, ".static_call_sites"); + struct symbol *code_sym = NULL; + unsigned long code_offset = 0; + struct reloc *reloc; + int ret = 0; + + if (!static_branch && !static_call) + return 0; + + sym_for_each_reloc(e->patched, sym, reloc) { + const char *sym_modname; + struct export *export; + + /* Static branch/call keys are always STT_OBJECT */ + if (reloc->sym->type != STT_OBJECT) { + + /* Save code location which can be printed below */ + if (reloc->sym->type == STT_FUNC && !code_sym) { + code_sym = reloc->sym; + code_offset = reloc_addend(reloc); + } + + continue; + } + + if (!klp_reloc_needed(reloc)) + continue; + + export = find_export(reloc->sym); + if (export) { + sym_modname = export->mod; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + } + + /* vmlinux keys are ok */ + if (!strcmp(sym_modname, "vmlinux")) + continue; + + if (static_branch) { + if (strstarts(reloc->sym->name, "__tracepoint_")) { + WARN("%s: disabling unsupported tracepoint %s", + code_sym->name, reloc->sym->name + 13); + ret = 1; + continue; + } + + ERROR("%s+0x%lx: unsupported static branch key %s. Use static_key_enabled() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + /* static call */ + if (strstarts(reloc->sym->name, "__SCK__tp_func_")) { + ret = 1; + continue; + } + + ERROR("%s()+0x%lx: unsupported static call key %s. Use KLP_STATIC_CALL() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + return ret; +} + +static int clone_special_section(struct elfs *e, struct section *patched_sec) +{ + struct symbol *patched_sym; + + /* + * Extract all special section symbols (and their dependencies) which + * reference included functions. + */ + sec_for_each_sym(patched_sec, patched_sym) { + int ret; + + if (!is_object_sym(patched_sym)) + continue; + + if (!should_keep_special_sym(e->patched, patched_sym)) + continue; + + ret = validate_special_section_klp_reloc(e, patched_sym); + if (ret < 0) + return -1; + if (ret > 0) + continue; + + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + + return 0; +} + +/* Extract only the needed bits from special sections */ +static int clone_special_sections(struct elfs *e) +{ + struct section *patched_sec; + + if (create_fake_symbols(e->patched)) + return -1; + + for_each_sec(e->patched, patched_sec) { + if (is_special_section(patched_sec)) { + if (clone_special_section(e, patched_sec)) + return -1; + } + } + + return 0; +} + +/* + * Create __klp_objects and __klp_funcs sections which are intermediate + * sections provided as input to the patch module's init code for building the + * klp_patch, klp_object and klp_func structs for the livepatch API. 
+ */ +static int create_klp_sections(struct elfs *e) +{ + size_t obj_size = sizeof(struct klp_object_ext); + size_t func_size = sizeof(struct klp_func_ext); + struct section *obj_sec, *funcs_sec, *str_sec; + struct symbol *funcs_sym, *str_sym, *sym; + char sym_name[SYM_NAME_LEN]; + unsigned int nr_funcs = 0; + const char *modname; + void *obj_data; + s64 addend; + + obj_sec = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0); + if (!obj_sec) + return -1; + + funcs_sec = elf_create_section_pair(e->out, KLP_FUNCS_SEC, func_size, 0, 0); + if (!funcs_sec) + return -1; + + funcs_sym = elf_create_section_symbol(e->out, funcs_sec); + if (!funcs_sym) + return -1; + + str_sec = elf_create_section(e->out, KLP_STRINGS_SEC, 0, 0, + SHT_PROGBITS, 1, + SHF_ALLOC | SHF_STRINGS | SHF_MERGE); + if (!str_sec) + return -1; + + if (elf_add_string(e->out, str_sec, "") == -1) + return -1; + + str_sym = elf_create_section_symbol(e->out, str_sec); + if (!str_sym) + return -1; + + /* allocate klp_object_ext */ + obj_data = elf_add_data(e->out, obj_sec, NULL, obj_size); + if (!obj_data) + return -1; + + modname = find_modname(e); + if (!modname) + return -1; + + /* klp_object_ext.name */ + if (strcmp(modname, "vmlinux")) { + addend = elf_add_string(e->out, str_sec, modname); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, name), + str_sym, addend, R_ABS64)) + return -1; + } + + /* klp_object_ext.funcs */ + if (!elf_create_reloc(e->out, obj_sec, offsetof(struct klp_object_ext, funcs), + funcs_sym, 0, R_ABS64)) + return -1; + + for_each_sym(e->out, sym) { + unsigned long offset = nr_funcs * func_size; + unsigned long sympos; + void *func_data; + + if (!is_func_sym(sym) || sym->cold || !sym->clone || !sym->clone->changed) + continue; + + /* allocate klp_func_ext */ + func_data = elf_add_data(e->out, funcs_sec, NULL, func_size); + if (!func_data) + return -1; + + /* klp_func_ext.old_name */ + addend = elf_add_string(e->out, str_sec, sym->clone->twin->name); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, old_name), + str_sym, addend, R_ABS64)) + return -1; + + /* klp_func_ext.new_func */ + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, new_func), + sym, 0, R_ABS64)) + return -1; + + /* klp_func_ext.sympos */ + BUILD_BUG_ON(sizeof(sympos) != sizeof_field(struct klp_func_ext, sympos)); + sympos = find_sympos(e->orig, sym->clone->twin); + if (sympos == ULONG_MAX) + return -1; + memcpy(func_data + offsetof(struct klp_func_ext, sympos), &sympos, + sizeof_field(struct klp_func_ext, sympos)); + + nr_funcs++; + } + + /* klp_object_ext.nr_funcs */ + BUILD_BUG_ON(sizeof(nr_funcs) != sizeof_field(struct klp_object_ext, nr_funcs)); + memcpy(obj_data + offsetof(struct klp_object_ext, nr_funcs), &nr_funcs, + sizeof_field(struct klp_object_ext, nr_funcs)); + + /* + * Find callback pointers created by KLP_PRE_PATCH_CALLBACK() and + * friends, and add them to the klp object. 
+ */ + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + return 0; +} + +/* + * Copy all .modinfo import_ns= tags to ensure all namespaced exported symbols + * can be accessed via normal relocs. 
+ */ +static int copy_import_ns(struct elfs *e) +{ + struct section *patched_sec, *out_sec = NULL; + char *import_ns, *data_end; + + patched_sec = find_section_by_name(e->patched, ".modinfo"); + if (!patched_sec) + return 0; + + import_ns = patched_sec->data->d_buf; + if (!import_ns) + return 0; + + for (data_end = import_ns + sec_size(patched_sec); + import_ns < data_end; + import_ns += strlen(import_ns) + 1) { + + import_ns = memmem(import_ns, data_end - import_ns, "import_ns=", 10); + if (!import_ns) + return 0; + + if (!out_sec) { + out_sec = find_section_by_name(e->out, ".modinfo"); + if (!out_sec) { + out_sec = elf_create_section(e->out, ".modinfo", 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return -1; + } + } + + if (!elf_add_data(e->out, out_sec, import_ns, strlen(import_ns) + 1)) + return -1; + } + + return 0; +} + +int cmd_klp_diff(int argc, const char **argv) +{ + struct elfs e = {0}; + + argc = parse_options(argc, argv, klp_diff_options, klp_diff_usage, 0); + if (argc != 3) + usage_with_options(klp_diff_usage, klp_diff_options); + + objname = argv[0]; + + e.orig = elf_open_read(argv[0], O_RDONLY); + e.patched = elf_open_read(argv[1], O_RDONLY); + e.out = NULL; + + if (!e.orig || !e.patched) + return -1; + + if (read_exports()) + return -1; + + if (read_sym_checksums(e.orig)) + return -1; + + if (read_sym_checksums(e.patched)) + return -1; + + if (correlate_symbols(&e)) + return -1; + + if (mark_changed_functions(&e)) + return 0; + + e.out = elf_create_file(&e.orig->ehdr, argv[2]); + if (!e.out) + return -1; + + if (clone_included_functions(&e)) + return -1; + + if (clone_special_sections(&e)) + return -1; + + if (create_klp_sections(&e)) + return -1; + + if (copy_import_ns(&e)) + return -1; + + if (elf_write(e.out)) + return -1; + + return elf_close(e.out); +} diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 5c8b974ad0f9..c8f611c1320d 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -16,8 +16,6 @@ #include #include -bool help; - static struct objtool_file file; struct objtool_file *objtool_open_read(const char *filename) @@ -71,6 +69,39 @@ int objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func) return 0; } +char *top_level_dir(const char *file) +{ + ssize_t len, self_len, file_len; + char self[PATH_MAX], *str; + int i; + + len = readlink("/proc/self/exe", self, sizeof(self) - 1); + if (len <= 0) + return NULL; + self[len] = '\0'; + + for (i = 0; i < 3; i++) { + char *s = strrchr(self, '/'); + if (!s) + return NULL; + *s = '\0'; + } + + self_len = strlen(self); + file_len = strlen(file); + + str = malloc(self_len + file_len + 2); + if (!str) + return NULL; + + memcpy(str, self, self_len); + str[self_len] = '/'; + strcpy(str + self_len + 1, file); + + return str; +} + + int main(int argc, const char **argv) { static const char *UNUSED = "OBJTOOL_NOT_IMPLEMENTED"; @@ -79,5 +110,11 @@ int main(int argc, const char **argv) exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED); pager_init(UNUSED); + if (argc > 1 && !strcmp(argv[1], "klp")) { + argc--; + argv++; + return cmd_klp(argc, argv); + } + return objtool_run(argc, argv); } diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 86d64e3ac6f7..e38167ca56a9 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -17,6 +17,7 @@ arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk 
include/linux/interval_tree_generic.h +include/linux/livepatch_external.h include/linux/static_call_types.h " diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c index d83f607733b0..d6562f292259 100644 --- a/tools/objtool/weak.c +++ b/tools/objtool/weak.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #define UNSUPPORTED(name) \ ({ \ @@ -24,3 +26,8 @@ int __weak orc_create(struct objtool_file *file) { UNSUPPORTED("ORC"); } + +int __weak cmd_klp(int argc, const char **argv) +{ + UNSUPPORTED("klp"); +} -- cgit v1.2.3 From b9976fa4649627c04dde26183333c3dcc90a0b76 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:04:11 -0700 Subject: livepatch: Introduce source code helpers for livepatch modules Add some helper macros which can be used by livepatch source .patch files to register callbacks, convert static calls to regular calls where needed, and patch syscalls. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/livepatch_helpers.h | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/linux/livepatch_helpers.h (limited to 'include/linux') diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h new file mode 100644 index 000000000000..99d68d0773fa --- /dev/null +++ b/include/linux/livepatch_helpers.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIVEPATCH_HELPERS_H +#define _LINUX_LIVEPATCH_HELPERS_H + +/* + * Interfaces for use by livepatch patches + */ + +#include +#include + +#ifdef MODULE +#define KLP_OBJNAME __KBUILD_MODNAME +#else +#define KLP_OBJNAME vmlinux +#endif + +/* Livepatch callback registration */ + +#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs" + +#define KLP_PRE_PATCH_CALLBACK(func) \ + klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_PATCH_CALLBACK(func) \ + klp_post_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_PRE_UNPATCH_CALLBACK(func) \ + klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_UNPATCH_CALLBACK(func) \ + klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func + +/* + * Replace static_call() usage with this macro when create-diff-object + * recommends it due to the original static call key living in a module. + * + * This converts the static call to a regular indirect call. + */ +#define KLP_STATIC_CALL(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +/* Syscall patching */ + +#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE2(name, ...) KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) + +#define KLP_SYSCALL_DEFINEx(x, sname, ...) \ + __KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +#ifdef CONFIG_X86_64 +// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code +#define __KLP_SYSCALL_DEFINEx(x, name, ...) 
\ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __X64_SYS_STUBx(x, name, __VA_ARGS__) \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#endif + +#endif /* _LINUX_LIVEPATCH_HELPERS_H */ -- cgit v1.2.3 From 10c4b4f60f5d0dbd29fa819be76e888501c7b729 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 13 Oct 2025 22:50:27 +0200 Subject: net: mdio: use macro module_driver to avoid boilerplate code Use macro module_driver to avoid boilerplate code. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/e5c37417-4984-4b57-8154-264deef61e0d@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index c640ba44dd6e..42d6d47e445b 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -689,16 +689,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); * init/exit. Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ -#define mdio_module_driver(_mdio_driver) \ -static int __init mdio_module_init(void) \ -{ \ - return mdio_driver_register(&_mdio_driver); \ -} \ -module_init(mdio_module_init); \ -static void __exit mdio_module_exit(void) \ -{ \ - mdio_driver_unregister(&_mdio_driver); \ -} \ -module_exit(mdio_module_exit) +#define mdio_module_driver(_mdio_driver) \ + module_driver(_mdio_driver, mdio_driver_register, mdio_driver_unregister) #endif /* __LINUX_MDIO_H__ */ -- cgit v1.2.3 From 433e294c3c5b5d2020085a0e36c1cb47b694690a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 1 Oct 2025 12:56:49 +0200 Subject: regulator: core: forward undervoltage events downstream by default Forward critical supply events downstream so consumers can react in time. An under-voltage event on an upstream rail may otherwise never reach end devices (e.g. eMMC). Register a notifier on a regulator's supply when the supply is resolved, and forward only REGULATOR_EVENT_UNDER_VOLTAGE to the consumer's notifier chain. Event handling is deferred to process context via a workqueue; the consumer rdev is lifetime-pinned and the rdev lock is held while calling the notifier chain. The notifier is unregistered on regulator teardown. No DT/UAPI changes. Behavior applies to all regulators with a supply. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20251001105650.2391477-1-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/core.c | 124 +++++++++++++++++++++++++++++++++++++++ include/linux/regulator/driver.h | 3 + 2 files changed, 127 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index dd7b10e768c0..84bc38911dba 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -83,6 +83,19 @@ struct regulator_supply_alias { const char *alias_supply; }; +/* + * Work item used to forward regulator events. 
+ * + * @work: workqueue entry + * @rdev: regulator device to notify (consumer receiving the forwarded event) + * @event: event code to be forwarded + */ +struct regulator_event_work { + struct work_struct work; + struct regulator_dev *rdev; + unsigned long event; +}; + static int _regulator_is_enabled(struct regulator_dev *rdev); static int _regulator_disable(struct regulator *regulator); static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags); @@ -1658,6 +1671,104 @@ static int set_machine_constraints(struct regulator_dev *rdev) return 0; } +/** + * regulator_event_work_fn - process a deferred regulator event + * @work: work_struct queued by the notifier + * + * Calls the regulator's notifier chain in process context while holding + * the rdev lock, then releases the device reference. + */ +static void regulator_event_work_fn(struct work_struct *work) +{ + struct regulator_event_work *rew = + container_of(work, struct regulator_event_work, work); + struct regulator_dev *rdev = rew->rdev; + int ret; + + regulator_lock(rdev); + ret = regulator_notifier_call_chain(rdev, rew->event, NULL); + regulator_unlock(rdev); + if (ret == NOTIFY_BAD) + dev_err(rdev_get_dev(rdev), "failed to forward regulator event\n"); + + put_device(rdev_get_dev(rdev)); + kfree(rew); +} + +/** + * regulator_event_forward_notifier - notifier callback for supply events + * @nb: notifier block embedded in the regulator + * @event: regulator event code + * @data: unused + * + * Packages the event into a work item and schedules it in process context. + * Takes a reference on @rdev->dev to pin the regulator until the work + * completes (see put_device() in the worker). + * + * Return: NOTIFY_OK on success, NOTIFY_DONE for events that are not forwarded. + */ +static int regulator_event_forward_notifier(struct notifier_block *nb, + unsigned long event, + void __always_unused *data) +{ + struct regulator_dev *rdev = container_of(nb, struct regulator_dev, + supply_fwd_nb); + struct regulator_event_work *rew; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + break; + default: + /* Only forward allowed events downstream. */ + return NOTIFY_DONE; + } + + rew = kmalloc(sizeof(*rew), GFP_ATOMIC); + if (!rew) + return NOTIFY_DONE; + + get_device(rdev_get_dev(rdev)); + rew->rdev = rdev; + rew->event = event; + INIT_WORK(&rew->work, regulator_event_work_fn); + + queue_work(system_highpri_wq, &rew->work); + + return NOTIFY_OK; +} + +/** + * register_regulator_event_forwarding - enable supply event forwarding + * @rdev: regulator device + * + * Registers a notifier on the regulator's supply so that supply events + * are forwarded to the consumer regulator via the deferred work handler. + * + * Return: 0 on success, -EALREADY if already enabled, or a negative error code. 
+ */
+static int register_regulator_event_forwarding(struct regulator_dev *rdev)
+{
+	int ret;
+
+	if (!rdev->supply)
+		return 0; /* top-level regulator: nothing to forward */
+
+	if (rdev->supply_fwd_nb.notifier_call)
+		return -EALREADY;
+
+	rdev->supply_fwd_nb.notifier_call = regulator_event_forward_notifier;
+
+	ret = regulator_register_notifier(rdev->supply, &rdev->supply_fwd_nb);
+	if (ret) {
+		dev_err(&rdev->dev, "failed to register supply notifier: %pe\n",
+			ERR_PTR(ret));
+		rdev->supply_fwd_nb.notifier_call = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
  * set_supply - set regulator supply regulator
  * @rdev: regulator (locked)
@@ -2144,6 +2255,16 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
 		goto out;
 	}
 
+	/*
+	 * Automatically register for event forwarding from the new supply.
+	 * This creates the downstream propagation link for events like
+	 * under-voltage.
+	 */
+	ret = register_regulator_event_forwarding(rdev);
+	if (ret < 0)
+		rdev_warn(rdev, "Failed to register event forwarding: %pe\n",
+			  ERR_PTR(ret));
+
 	regulator_unlock_two(rdev, r, &ww_ctx);
 
 	/* rdev->supply was created in set_supply() */
@@ -6031,6 +6152,9 @@ void regulator_unregister(struct regulator_dev *rdev)
 		return;
 
 	if (rdev->supply) {
+		regulator_unregister_notifier(rdev->supply,
+					      &rdev->supply_fwd_nb);
+
 		while (rdev->use_count--)
 			regulator_disable(rdev->supply);
 		regulator_put(rdev->supply);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4a216fdba354..978cf593b662 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -658,6 +658,9 @@ struct regulator_dev {
 	spinlock_t err_lock;
 
 	int pw_requested_mW;
+
+	/* regulator notification forwarding */
+	struct notifier_block supply_fwd_nb;
 };
 
 /*
-- cgit v1.2.3


From 48a97ffc6c826640907d13b199e29008f4fe2c15 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Tue, 14 Oct 2025 13:14:03 -0700
Subject: bpf: Consistently use bpf_rcu_lock_held() everywhere

We have many places which open-code what is now the bpf_rcu_lock_held()
macro, so replace all those places with a clean and short macro
invocation.

For that, move the bpf_rcu_lock_held() macro into include/linux/bpf.h.
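
The conversion itself is mechanical: each open-coded check becomes a
single macro invocation, as in the hashtab/helpers hunks below:

	/* before */
	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	/* after */
	WARN_ON_ONCE(!bpf_rcu_lock_held());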
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20251014201403.4104511-1-andrii@kernel.org --- include/linux/bpf.h | 3 +++ include/linux/bpf_local_storage.h | 3 --- kernel/bpf/hashtab.c | 21 +++++++-------------- kernel/bpf/helpers.c | 12 ++++-------- 4 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f87fb203aaae..86afd9ac6848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2381,6 +2381,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index ab7244d8108f..782f58feea35 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -18,9 +18,6 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 -#define bpf_rcu_lock_held() \ - (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ - rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e7a6ba04dc82..f876f09355f0 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -657,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1086,8 +1085,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1194,8 +1192,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1263,8 +1260,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1326,8 +1322,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1404,8 +1399,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1440,8 +1434,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + 
WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dea8443f782c..825280c953be 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -42,8 +42,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -59,8 +58,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -77,8 +75,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } @@ -134,8 +131,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } -- cgit v1.2.3 From 378e6523ebb1e80b3955b7675cfe40b07028d085 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 14 Oct 2025 08:02:47 +0200 Subject: net: bcmgenet: remove unused platform code This effectively reverts b0ba512e25d7 ("net: bcmgenet: enable driver to work without a device tree"). There has never been an in-tree user of struct bcmgenet_platform_data, all devices use OF or ACPI. 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/108b4e64-55d4-4b4e-9a11-3c810c319d66@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - drivers/net/ethernet/broadcom/genet/bcmgenet.c | 20 +++---- drivers/net/ethernet/broadcom/genet/bcmmii.c | 75 +------------------------- include/linux/platform_data/bcmgenet.h | 19 ------- 4 files changed, 7 insertions(+), 108 deletions(-) delete mode 100644 include/linux/platform_data/bcmgenet.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 3a27901781c2..4c4b519171f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5123,7 +5123,6 @@ F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml F: drivers/net/ethernet/broadcom/genet/ F: drivers/net/ethernet/broadcom/unimac.h F: drivers/net/mdio/mdio-bcm-unimac.c -F: include/linux/platform_data/bcmgenet.h F: include/linux/platform_data/mdio-bcm-unimac.h BROADCOM IPROC ARM ARCHITECTURE diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 98971ae4f87d..d99ef92feb82 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -35,7 +35,6 @@ #include #include #include -#include #include @@ -3926,7 +3925,6 @@ MODULE_DEVICE_TABLE(of, bcmgenet_match); static int bcmgenet_probe(struct platform_device *pdev) { - struct bcmgenet_platform_data *pd = pdev->dev.platform_data; const struct bcmgenet_plat_data *pdata; struct bcmgenet_priv *priv; struct net_device *dev; @@ -4010,9 +4008,6 @@ static int bcmgenet_probe(struct platform_device *pdev) priv->version = pdata->version; priv->dma_max_burst_length = pdata->dma_max_burst_length; priv->flags = pdata->flags; - } else { - priv->version = pd->genet_version; - priv->dma_max_burst_length = DMA_MAX_BURST_LENGTH; } priv->clk = devm_clk_get_optional(&priv->pdev->dev, "enet"); @@ -4062,16 +4057,13 @@ static int bcmgenet_probe(struct platform_device *pdev) if (device_get_phy_mode(&pdev->dev) == PHY_INTERFACE_MODE_INTERNAL) bcmgenet_power_up(priv, GENET_POWER_PASSIVE); - if (pd && !IS_ERR_OR_NULL(pd->mac_address)) - eth_hw_addr_set(dev, pd->mac_address); - else - if (device_get_ethdev_address(&pdev->dev, dev)) - if (has_acpi_companion(&pdev->dev)) { - u8 addr[ETH_ALEN]; + if (device_get_ethdev_address(&pdev->dev, dev)) + if (has_acpi_companion(&pdev->dev)) { + u8 addr[ETH_ALEN]; - bcmgenet_get_hw_addr(priv, addr); - eth_hw_addr_set(dev, addr); - } + bcmgenet_get_hw_addr(priv, addr); + eth_hw_addr_set(dev, addr); + } if (!is_valid_ether_addr(dev->dev_addr)) { dev_warn(&pdev->dev, "using random Ethernet MAC\n"); diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 573e8b279e52..38f854b94a79 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "bcmgenet.h" @@ -436,23 +435,6 @@ static struct device_node *bcmgenet_mii_of_find_mdio(struct bcmgenet_priv *priv) return priv->mdio_dn; } -static void bcmgenet_mii_pdata_init(struct bcmgenet_priv *priv, - struct unimac_mdio_pdata *ppd) -{ - struct device *kdev = &priv->pdev->dev; - struct bcmgenet_platform_data *pd = kdev->platform_data; - - if (pd->phy_interface != PHY_INTERFACE_MODE_MOCA && pd->mdio_enabled) { - /* - * Internal or external PHY with MDIO access - */ - if (pd->phy_address >= 0 && pd->phy_address < PHY_MAX_ADDR) - 
ppd->phy_mask = 1 << pd->phy_address; - else - ppd->phy_mask = 0; - } -} - static int bcmgenet_mii_wait(void *wait_func_data) { struct bcmgenet_priv *priv = wait_func_data; @@ -467,7 +449,6 @@ static int bcmgenet_mii_wait(void *wait_func_data) static int bcmgenet_mii_register(struct bcmgenet_priv *priv) { struct platform_device *pdev = priv->pdev; - struct bcmgenet_platform_data *pdata = pdev->dev.platform_data; struct device_node *dn = pdev->dev.of_node; struct unimac_mdio_pdata ppd; struct platform_device *ppdev; @@ -511,8 +492,6 @@ static int bcmgenet_mii_register(struct bcmgenet_priv *priv) ppdev->dev.parent = &pdev->dev; if (dn) ppdev->dev.of_node = bcmgenet_mii_of_find_mdio(priv); - else if (pdata) - bcmgenet_mii_pdata_init(priv, &ppd); else ppd.phy_mask = ~0; @@ -594,58 +573,6 @@ static int bcmgenet_mii_of_init(struct bcmgenet_priv *priv) return 0; } -static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv) -{ - struct device *kdev = &priv->pdev->dev; - struct bcmgenet_platform_data *pd = kdev->platform_data; - char phy_name[MII_BUS_ID_SIZE + 3]; - char mdio_bus_id[MII_BUS_ID_SIZE]; - struct phy_device *phydev; - - snprintf(mdio_bus_id, MII_BUS_ID_SIZE, "%s-%d", - UNIMAC_MDIO_DRV_NAME, priv->pdev->id); - - if (pd->phy_interface != PHY_INTERFACE_MODE_MOCA && pd->mdio_enabled) { - snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, - mdio_bus_id, pd->phy_address); - - /* - * Internal or external PHY with MDIO access - */ - phydev = phy_attach(priv->dev, phy_name, pd->phy_interface); - if (IS_ERR(phydev)) { - dev_err(kdev, "failed to register PHY device\n"); - return PTR_ERR(phydev); - } - } else { - /* - * MoCA port or no MDIO access. - * Use fixed PHY to represent the link layer. - */ - struct fixed_phy_status fphy_status = { - .link = 1, - .speed = pd->phy_speed, - .duplex = pd->phy_duplex, - .pause = 0, - .asym_pause = 0, - }; - - phydev = fixed_phy_register(&fphy_status, NULL); - if (IS_ERR(phydev)) { - dev_err(kdev, "failed to register fixed PHY device\n"); - return PTR_ERR(phydev); - } - - /* Make sure we initialize MoCA PHYs with a link down */ - phydev->link = 0; - - } - - priv->phy_interface = pd->phy_interface; - - return 0; -} - static int bcmgenet_mii_bus_init(struct bcmgenet_priv *priv) { struct device *kdev = &priv->pdev->dev; @@ -656,7 +583,7 @@ static int bcmgenet_mii_bus_init(struct bcmgenet_priv *priv) else if (has_acpi_companion(kdev)) return bcmgenet_phy_interface_init(priv); else - return bcmgenet_mii_pd_init(priv); + return -EINVAL; } int bcmgenet_mii_init(struct net_device *dev) diff --git a/include/linux/platform_data/bcmgenet.h b/include/linux/platform_data/bcmgenet.h deleted file mode 100644 index d8f8738629d2..000000000000 --- a/include/linux/platform_data/bcmgenet.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_PLATFORM_DATA_BCMGENET_H__ -#define __LINUX_PLATFORM_DATA_BCMGENET_H__ - -#include -#include -#include - -struct bcmgenet_platform_data { - bool mdio_enabled; - phy_interface_t phy_interface; - int phy_address; - int phy_speed; - int phy_duplex; - u8 mac_address[ETH_ALEN]; - int genet_version; -}; - -#endif -- cgit v1.2.3 From 44f5c8ec5b9ad8ed4ade08d727f803b2bb07f1c3 Mon Sep 17 00:00:00 2001 From: Ryan Newton Date: Wed, 15 Oct 2025 11:50:35 -0400 Subject: sched_ext: Add lockless peek operation for DSQs The builtin DSQ queue data structures are meant to be used by a wide range of different sched_ext schedulers with different demands on these data structures. 
They might be per-cpu with low-contention, or high-contention shared queues. Unfortunately, DSQs have a coarse-grained lock around the whole data structure. Without going all the way to a lock-free, more scalable implementation, a small step we can take to reduce lock contention is to allow a lockless, small-fixed-cost peek at the head of the queue. This change allows certain custom SCX schedulers to cheaply peek at queues, e.g. during load balancing, before locking them. But it represents a few extra memory operations to update the pointer each time the DSQ is modified, including a memory barrier on ARM so the write appears correctly ordered. This commit adds a first_task pointer field which is updated atomically when the DSQ is modified, and allows any thread to peek at the head of the queue without holding the lock. Signed-off-by: Ryan Newton Reviewed-by: Andrea Righi Reviewed-by: Christian Loehle Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 58 ++++++++++++++++++++++++++++++-- tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/include/scx/compat.bpf.h | 18 ++++++++++ 4 files changed, 76 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9848aeab2786..4713f374acc0 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -58,6 +58,7 @@ enum scx_dsq_id_flags { */ struct scx_dispatch_q { raw_spinlock_t lock; + struct task_struct __rcu *first_task; /* lockless peek at head */ struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 430749ce46ab..f9c0888ef279 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -965,8 +965,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ @@ -974,10 +977,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1032,6 +1044,13 @@ static void task_unlink_from_dsq(struct task_struct *p, p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; } + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } + list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); } @@ -6292,6 +6311,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) kit->dsq = NULL; 
} +/** + * scx_bpf_dsq_peek - Lockless peek at the first element. + * @dsq_id: DSQ to examine. + * + * Read the first element in the DSQ. This is semantically equivalent to using + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, + * this provides only a point-in-time snapshot, and the contents may change + * by the time any subsequent locking operation reads the queue. + * + * Returns the pointer, or NULL indicates an empty queue OR internal error. + */ +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); + return NULL; + } + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); + return NULL; + } + + return rcu_dereference(dsq->first_task); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6851,6 +6904,7 @@ BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index eb3c99445cb3..e65b1eb668ea 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -74,6 +74,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index e487c10b5e07..619a16f0d39a 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -26,6 +26,24 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym (bpf_ksym_exists(bpf_cpumask_populate) ? \ (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) +/* + * v6.19: Introduce lockless peek API for user DSQs. + * + * Preserve the following macro until v6.21. + */ +static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) +{ + struct task_struct *p = NULL; + struct bpf_iter_scx_dsq it; + + if (bpf_ksym_exists(scx_bpf_dsq_peek)) + return scx_bpf_dsq_peek(dsq_id); + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) + p = bpf_iter_scx_dsq_next(&it); + bpf_iter_scx_dsq_destroy(&it); + return p; +} + /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on * in a compatible way. We will preserve this __COMPAT helper until v6.16. 
-- cgit v1.2.3 From c97513cddcfc235f2522617980838e500af21d01 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 9 Sep 2025 22:52:43 +0800 Subject: hung_task: fix warnings caused by unaligned lock pointers The blocker tracking mechanism assumes that lock pointers are at least 4-byte aligned to use their lower bits for type encoding. However, as reported by Eero Tamminen, some architectures like m68k only guarantee 2-byte alignment of 32-bit values. This breaks the assumption and causes two related WARN_ON_ONCE checks to trigger. To fix this, the runtime checks are adjusted to silently ignore any lock that is not 4-byte aligned, effectively disabling the feature in such cases and avoiding the related warnings. Thanks to Geert Uytterhoeven for bisecting! Link: https://lkml.kernel.org/r/20250909145243.17119-1-lance.yang@linux.dev Fixes: e711faaafbe5 ("hung_task: replace blocker_mutex with encoded blocker") Signed-off-by: Lance Yang Reported-by: Eero Tamminen Closes: https://lore.kernel.org/lkml/CAMuHMdW7Ab13DdGs2acMQcix5ObJK0O2dG_Fxzr8_g58Rc1_0g@mail.gmail.com Reviewed-by: Masami Hiramatsu (Google) Cc: John Paul Adrian Glaubitz Cc: Anna Schumaker Cc: Boqun Feng Cc: Finn Thain Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Lance Yang Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Signed-off-by: Andrew Morton --- include/linux/hung_task.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h index 34e615c76ca5..c4403eeb7144 100644 --- a/include/linux/hung_task.h +++ b/include/linux/hung_task.h @@ -20,6 +20,10 @@ * always zero. So we can use these bits to encode the specific blocking * type. * + * Note that on architectures where this is not guaranteed, or for any + * unaligned lock, this tracking mechanism is silently skipped for that + * lock. + * * Type encoding: * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) @@ -45,7 +49,7 @@ static inline void hung_task_set_blocker(void *lock, unsigned long type) * If the lock pointer matches the BLOCKER_TYPE_MASK, return * without writing anything. */ - if (WARN_ON_ONCE(lock_ptr & BLOCKER_TYPE_MASK)) + if (lock_ptr & BLOCKER_TYPE_MASK) return; WRITE_ONCE(current->blocker, lock_ptr | type); @@ -53,8 +57,6 @@ static inline void hung_task_set_blocker(void *lock, unsigned long type) static inline void hung_task_clear_blocker(void) { - WARN_ON_ONCE(!READ_ONCE(current->blocker)); - WRITE_ONCE(current->blocker, 0UL); } -- cgit v1.2.3 From e9139f765ac7048cadc9981e962acdf8b08eabf3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Oct 2024 13:43:43 +0100 Subject: sched: Employ sched_change guards As proposed a long while ago -- and half done by scx -- wrap the scheduler's 'change' pattern in a guard helper. 
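
At the call sites the open-coded dequeue_task()/put_prev_task() ...
enqueue_task()/set_next_task() sequence then collapses into a scope. A
minimal sketch of the resulting pattern (new_prio is only a placeholder
for whatever property a given call site actually updates):

	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
		/* p is dequeued (and put as prev if it was running) here */
		p->prio = new_prio;
	}
	/* on scope exit p is re-enqueued and set_next_task()'d if needed */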
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Tejun Heo Acked-by: Vincent Guittot --- include/linux/cleanup.h | 5 ++ kernel/sched/core.c | 159 +++++++++++++++++++----------------------------- kernel/sched/ext.c | 39 ++++++------ kernel/sched/sched.h | 33 +++++++--- kernel/sched/syscalls.c | 65 +++++++------------- 5 files changed, 131 insertions(+), 170 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..ae381675455d 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -340,6 +340,11 @@ _label: \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond +#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)1; } + #define __GUARD_IS_ERR(_ptr) \ ({ \ unsigned long _rc = (__force unsigned long)(_ptr); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 198d2dd45f59..eca40df4b6d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (prev_class != next_class && p->se.sched_delayed) dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); - - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. 
-dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - p->sched_class = next_class; - p->prio = prio; + p->sched_class = next_class; + p->prio = prio; - check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); + check_class_changing(rq, p, prev_class); + } check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -9205,8 +9178,9 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); @@ -9214,29 +9188,16 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) update_rq_clock(rq); - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. 
- */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struct *t) } #endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..4566a7c81360 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kthread_work *work) const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + } check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); @@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; if (!tryget_task_struct(p)) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | 
DEQUEUE_MOVE, &ctx); + update_rq_clock(task_rq(p)); - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + } check_class_changed(task_rq(p), p, old_class, p->prio); put_task_struct(p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1f5d07067f60..6546849aa075 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3885,23 +3885,38 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. + */ + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. 
*/ -struct sched_enq_and_set_ctx { +struct sched_change_ctx { struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..09ffe91410b1 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; struct rq *rq; int old_prio; @@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } /* * If the task increased its priority or is running and @@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -698,33 +687,25 @@ change: if (prev_class != next_class && p->se.sched_delayed) dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). 
+ */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); -- cgit v1.2.3 From b079d93796528053cde322f2ca838c2d21c297e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Sep 2025 10:08:05 +0200 Subject: sched: Rename do_set_cpus_allowed() Hopefully saner naming. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Tejun Heo Acked-by: Vincent Guittot --- include/linux/sched.h | 4 ++-- kernel/cgroup/cpuset.c | 2 +- kernel/kthread.c | 4 ++-- kernel/sched/core.c | 16 ++++++++-------- kernel/sched/sched.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..77426c347cff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..185e820cd1df 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 832bd2afecc6..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -599,7 +599,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas } scoped_guard (raw_spinlock_irqsave, &p->pi_lock) - do_set_cpus_allowed(p, mask); + set_cpus_allowed_force(p, mask); /* It's safe because the task is inactive. 
*/ p->flags |= PF_NO_SETAFFINITY; @@ -880,7 +880,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) kthread_fetch_affinity(kthread, affinity); scoped_guard (raw_spinlock_irqsave, &p->pi_lock) - do_set_cpus_allowed(p, affinity); + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 805e65007e62..638bffd4c1a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2331,7 +2331,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2348,7 +2348,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) scoped_guard (task_rq_lock, p) { update_rq_clock(scope.rq); - __do_set_cpus_allowed(p, &ac); + do_set_cpus_allowed(p, &ac); } } @@ -2662,7 +2662,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); bool queued, running; @@ -2692,7 +2692,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2706,7 +2706,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) scoped_guard (__task_rq_lock, p) { update_rq_clock(scope.rq); - __do_set_cpus_allowed(p, &ac); + do_set_cpus_allowed(p, &ac); } /* @@ -2745,7 +2745,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3073,7 +3073,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3482,7 +3482,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b23ce9c77611..ea2ea8fd6505 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2617,7 +2617,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. 
*/ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); -- cgit v1.2.3 From 50653216e4ff7a74c95b2ee9ec439916875556ec Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sat, 9 Aug 2025 14:47:50 -0400 Subject: sched: Add support to pick functions to take rf Some pick functions like the internal pick_next_task_fair() already take rf but some others dont. We need this for scx's server pick function. Prepare for this by having pick functions accept it. [peterz: - added RETRY_TASK handling - removed pick_next_task_fair indirection] Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo --- include/linux/sched.h | 7 ++----- kernel/sched/core.c | 35 ++++++++++++++++++++++++++--------- kernel/sched/deadline.c | 8 ++++---- kernel/sched/ext.c | 2 +- kernel/sched/fair.c | 26 ++++++-------------------- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 10 ++++++---- kernel/sched/stop_task.c | 2 +- 9 files changed, 48 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 77426c347cff..07576479c0ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -637,8 +637,8 @@ struct sched_rt_entity { #endif } __randomize_layout; -typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); -typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; @@ -730,9 +730,6 @@ struct sched_dl_entity { * dl_server_update(). * * @rq the runqueue this server is for - * - * @server_has_tasks() returns true if @server_pick return a - * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fc990ff6845..a75d45680f9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5901,7 +5901,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5913,11 +5913,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -5947,7 +5951,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. 
+ */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -5955,7 +5963,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -5970,7 +5978,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6055,7 +6063,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6075,6 +6086,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6086,7 +6099,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6108,7 +6125,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 83e6175d79f5..48357d4609bf 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2352,7 +2352,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. 
*/ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2366,7 +2366,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2379,9 +2379,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 949c3a6e24d4..dc743cac59cb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2332,7 +2332,7 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) { struct task_struct *prev = rq->curr; struct task_struct *p; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23ac05cca4a4..2554055c1ba1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8705,15 +8705,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8822,7 +8813,7 @@ preempt: resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8866,7 +8857,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8945,14 +8936,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) -{ - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -13644,11 +13631,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 055b0ddbcd54..7fa0b593bcff 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -466,7 +466,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return 
rq->idle; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9bc828d59121..1fd97f2d7ec6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4a323007dce..8946294929a4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2470,7 +2470,7 @@ struct sched_class { /* * schedule/pick_next_task: rq->lock */ - struct task_struct *(*pick_task)(struct rq *rq); + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2480,7 +2480,8 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); /* * sched_change: @@ -2707,8 +2708,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d98c453c9b4e..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; -- cgit v1.2.3 From ae495810cffe29c3c30a757bd48b0bb035fc3098 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 14 Oct 2025 18:53:53 +0300 Subject: gpio: regmap: add the .fixed_direction_output configuration parameter There are GPIO controllers such as the one present in the LX2160ARDB QIXIS FPGA which have fixed-direction input and output GPIO lines mixed together in a single register. This cannot be modeled using the gpio-regmap as-is since there is no way to present the true direction of a GPIO line. In order to make this use case possible, add a new configuration parameter - fixed_direction_output - into the gpio_regmap_config structure. This will enable user drivers to provide a bitmap that represents the fixed direction of the GPIO lines. 
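As a rough sketch of how a consumer driver could use the new parameter (the function name qixis_gpio_probe, the 8-line bank and the register offsets below are invented for illustration; gpio_regmap_config, GPIO_REGMAP_ADDR() and devm_gpio_regmap_register() are existing gpio-regmap API, and fixed_direction_output is the member added by this patch):

#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/gpio/regmap.h>
#include <linux/platform_device.h>
#include <linux/regmap.h>

static int qixis_gpio_probe(struct platform_device *pdev)
{
	DECLARE_BITMAP(fixed_out, 8);
	struct gpio_regmap_config config = { };
	struct regmap *regmap = dev_get_regmap(pdev->dev.parent, NULL);

	if (!regmap)
		return -ENODEV;

	/* Lines 0-3 are fixed outputs, lines 4-7 are fixed inputs. */
	bitmap_zero(fixed_out, 8);
	bitmap_set(fixed_out, 0, 4);

	config.parent = &pdev->dev;
	config.regmap = regmap;
	config.ngpio = 8;
	/* Data and set registers live at offset 0 in this made-up layout. */
	config.reg_dat_base = GPIO_REGMAP_ADDR(0x0);
	config.reg_set_base = GPIO_REGMAP_ADDR(0x0);
	config.fixed_direction_output = fixed_out;

	return PTR_ERR_OR_ZERO(devm_gpio_regmap_register(&pdev->dev, &config));
}

Since gpio_regmap_register() copies the bitmap into its own allocation, the caller's bitmap does not need to outlive the registration call.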
Signed-off-by: Ioana Ciornei Acked-by: Bartosz Golaszewski Reviewed-by: Michael Walle Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-regmap.c | 26 ++++++++++++++++++++++++-- include/linux/gpio/regmap.h | 5 +++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index ab9e4077fa60..f4267af00027 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -31,6 +31,7 @@ struct gpio_regmap { unsigned int reg_clr_base; unsigned int reg_dir_in_base; unsigned int reg_dir_out_base; + unsigned long *fixed_direction_output; #ifdef CONFIG_REGMAP_IRQ int regmap_irq_line; @@ -134,6 +135,13 @@ static int gpio_regmap_get_direction(struct gpio_chip *chip, unsigned int base, val, reg, mask; int invert, ret; + if (gpio->fixed_direction_output) { + if (test_bit(offset, gpio->fixed_direction_output)) + return GPIO_LINE_DIRECTION_OUT; + else + return GPIO_LINE_DIRECTION_IN; + } + if (gpio->reg_dat_base && !gpio->reg_set_base) return GPIO_LINE_DIRECTION_IN; if (gpio->reg_set_base && !gpio->reg_dat_base) @@ -284,6 +292,17 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config goto err_free_gpio; } + if (config->fixed_direction_output) { + gpio->fixed_direction_output = bitmap_alloc(chip->ngpio, + GFP_KERNEL); + if (!gpio->fixed_direction_output) { + ret = -ENOMEM; + goto err_free_gpio; + } + bitmap_copy(gpio->fixed_direction_output, + config->fixed_direction_output, chip->ngpio); + } + /* if not set, assume there is only one register */ gpio->ngpio_per_reg = config->ngpio_per_reg; if (!gpio->ngpio_per_reg) @@ -300,7 +319,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config ret = gpiochip_add_data(chip, gpio); if (ret < 0) - goto err_free_gpio; + goto err_free_bitmap; #ifdef CONFIG_REGMAP_IRQ if (config->regmap_irq_chip) { @@ -309,7 +328,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config config->regmap_irq_line, config->regmap_irq_flags, 0, config->regmap_irq_chip, &gpio->irq_chip_data); if (ret) - goto err_free_gpio; + goto err_free_bitmap; irq_domain = regmap_irq_get_domain(gpio->irq_chip_data); } else @@ -326,6 +345,8 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config err_remove_gpiochip: gpiochip_remove(chip); +err_free_bitmap: + bitmap_free(gpio->fixed_direction_output); err_free_gpio: kfree(gpio); return ERR_PTR(ret); @@ -344,6 +365,7 @@ void gpio_regmap_unregister(struct gpio_regmap *gpio) #endif gpiochip_remove(&gpio->gpio_chip); + bitmap_free(gpio->fixed_direction_output); kfree(gpio); } EXPORT_SYMBOL_GPL(gpio_regmap_unregister); diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 622a2939ebe0..87983a5f3681 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -38,6 +38,10 @@ struct regmap; * offset to a register/bitmask pair. If not * given the default gpio_regmap_simple_xlate() * is used. + * @fixed_direction_output: + * (Optional) Bitmap representing the fixed direction of + * the GPIO lines. Useful when there are GPIO lines with a + * fixed direction mixed together in the same register. * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). 
@@ -85,6 +89,7 @@ struct gpio_regmap_config { int reg_stride; int ngpio_per_reg; struct irq_domain *irq_domain; + unsigned long *fixed_direction_output; #ifdef CONFIG_REGMAP_IRQ struct regmap_irq_chip *regmap_irq_chip; -- cgit v1.2.3 From eba11116f39533d2e38cc5898014f2c95f32d23a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Oct 2025 15:07:15 +0200 Subject: gpiolib: of: Get rid of Last user of linux/gpio/legacy-of-mm-gpiochip.h is gone. Remove linux/gpio/legacy-of-mm-gpiochip.h and CONFIG_OF_GPIO_MM_GPIOCHIP Signed-off-by: Christophe Leroy Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 8 --- drivers/gpio/TODO | 11 ----- drivers/gpio/gpiolib-of.c | 79 ------------------------------ include/linux/gpio/legacy-of-mm-gpiochip.h | 36 -------------- 4 files changed, 134 deletions(-) delete mode 100644 include/linux/gpio/legacy-of-mm-gpiochip.h (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 154d31b75070..ce237398fa00 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -42,14 +42,6 @@ config GPIOLIB_IRQCHIP select IRQ_DOMAIN bool -config OF_GPIO_MM_GPIOCHIP - bool - help - This adds support for the legacy 'struct of_mm_gpio_chip' interface - from PowerPC. Existing drivers using this interface need to select - this symbol, but new drivers should use the generic gpio-regmap - infrastructure instead. - config DEBUG_GPIO bool "Debug GPIO calls" depends on DEBUG_KERNEL diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index 8ed74e05903a..5acaeab029ec 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -86,17 +86,6 @@ Work items: ------------------------------------------------------------------------------- -Get rid of - -Work items: - -- Get rid of struct of_mm_gpio_chip altogether: use the generic MMIO - GPIO for all current users (see below). Delete struct of_mm_gpio_chip, - to_of_mm_gpio_chip(), of_mm_gpiochip_add_data(), of_mm_gpiochip_remove(), - CONFIG_OF_GPIO_MM_GPIOCHIP from the kernel. - -------------------------------------------------------------------------------- - Collect drivers Collect GPIO drivers from arch/* and other places that should be placed diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index fad4edf9cc5c..8657379e9165 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -1031,85 +1031,6 @@ static int of_gpio_threecell_xlate(struct gpio_chip *gc, return gpiospec->args[1]; } -#if IS_ENABLED(CONFIG_OF_GPIO_MM_GPIOCHIP) -#include -/** - * of_mm_gpiochip_add_data - Add memory mapped GPIO chip (bank) - * @np: device node of the GPIO chip - * @mm_gc: pointer to the of_mm_gpio_chip allocated structure - * @data: driver data to store in the struct gpio_chip - * - * To use this function you should allocate and fill mm_gc with: - * - * 1) In the gpio_chip structure: - * - all the callbacks - * - of_gpio_n_cells - * - of_xlate callback (optional) - * - * 3) In the of_mm_gpio_chip structure: - * - save_regs callback (optional) - * - * If succeeded, this function will map bank's memory and will - * do all necessary work for you. Then you'll able to use .regs - * to manage GPIOs from the callbacks. - * - * Returns: - * 0 on success, or negative errno on failure. 
- */ -int of_mm_gpiochip_add_data(struct device_node *np, - struct of_mm_gpio_chip *mm_gc, - void *data) -{ - int ret = -ENOMEM; - struct gpio_chip *gc = &mm_gc->gc; - - gc->label = kasprintf(GFP_KERNEL, "%pOF", np); - if (!gc->label) - goto err0; - - mm_gc->regs = of_iomap(np, 0); - if (!mm_gc->regs) - goto err1; - - gc->base = -1; - - if (mm_gc->save_regs) - mm_gc->save_regs(mm_gc); - - fwnode_handle_put(mm_gc->gc.fwnode); - mm_gc->gc.fwnode = fwnode_handle_get(of_fwnode_handle(np)); - - ret = gpiochip_add_data(gc, data); - if (ret) - goto err2; - - return 0; -err2: - of_node_put(np); - iounmap(mm_gc->regs); -err1: - kfree(gc->label); -err0: - pr_err("%pOF: GPIO chip registration failed with status %d\n", np, ret); - return ret; -} -EXPORT_SYMBOL_GPL(of_mm_gpiochip_add_data); - -/** - * of_mm_gpiochip_remove - Remove memory mapped GPIO chip (bank) - * @mm_gc: pointer to the of_mm_gpio_chip allocated structure - */ -void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc) -{ - struct gpio_chip *gc = &mm_gc->gc; - - gpiochip_remove(gc); - iounmap(mm_gc->regs); - kfree(gc->label); -} -EXPORT_SYMBOL_GPL(of_mm_gpiochip_remove); -#endif - #ifdef CONFIG_PINCTRL static int of_gpiochip_add_pin_range(struct gpio_chip *chip) { diff --git a/include/linux/gpio/legacy-of-mm-gpiochip.h b/include/linux/gpio/legacy-of-mm-gpiochip.h deleted file mode 100644 index 2e2bd3b19cc3..000000000000 --- a/include/linux/gpio/legacy-of-mm-gpiochip.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * OF helpers for the old of_mm_gpio_chip, used on ppc32 and nios2, - * do not use in new code. - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - */ - -#ifndef __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H -#define __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H - -#include -#include - -/* - * OF GPIO chip for memory mapped banks - */ -struct of_mm_gpio_chip { - struct gpio_chip gc; - void (*save_regs)(struct of_mm_gpio_chip *mm_gc); - void __iomem *regs; -}; - -static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) -{ - return container_of(gc, struct of_mm_gpio_chip, gc); -} - -extern int of_mm_gpiochip_add_data(struct device_node *np, - struct of_mm_gpio_chip *mm_gc, - void *data); -extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); - -#endif /* __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H */ -- cgit v1.2.3 From 1e3e330c07076a0582385bbea029c9cc918fa30d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Oct 2025 11:46:11 +0200 Subject: irqchip: Pass platform device to platform drivers The IRQCHIP_PLATFORM_DRIVER macros can be used to convert OF irqchip drivers to platform drivers but currently reuse the OF init callback prototype that only takes OF nodes as arguments. This forces drivers to do reverse lookups of their struct devices during probe if they need them for things like dev_printk() and device managed resources. Half of the drivers doing reverse lookups also currently fail to release the additional reference taken during the lookup, while other drivers have had the reference leak plugged in various ways (e.g. using non-intuitive cleanup constructs which still confuse static checkers). Switch to using a probe callback that takes a platform device as its first argument to simplify drivers and plug the remaining (mostly benign) reference leaks. 
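A minimal sketch of a driver converted to the new prototype (the compatible string "acme,demo-intc" and demo_intc_probe are made-up names used only for illustration; the IRQCHIP_PLATFORM_DRIVER_BEGIN/IRQCHIP_MATCH/IRQCHIP_PLATFORM_DRIVER_END macros and the two-argument probe signature are the ones used by the converted drivers below):

#include <linux/irqchip.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static int demo_intc_probe(struct platform_device *pdev, struct device_node *parent)
{
	struct device_node *node = pdev->dev.of_node;

	/*
	 * The probe receives the platform device directly, so dev_*()
	 * printks and devm_* resources are usable without an
	 * of_find_device_by_node() reverse lookup and without the
	 * put_device() such lookups tended to leak.
	 */
	dev_info(&pdev->dev, "probing %pOF (parent %pOF)\n", node, parent);

	return 0;
}

IRQCHIP_PLATFORM_DRIVER_BEGIN(demo_intc)
IRQCHIP_MATCH("acme,demo-intc", demo_intc_probe)
IRQCHIP_PLATFORM_DRIVER_END(demo_intc)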
Fixes: 32c6c054661a ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller") Fixes: 70afdab904d2 ("irqchip: Add IMX MU MSI controller driver") Fixes: a6199bb514d8 ("irqchip: Add Qualcomm MPM controller driver") Signed-off-by: Johan Hovold Signed-off-by: Thomas Gleixner Reviewed-by: Florian Fainelli Reviewed-by: Changhuang Liang --- drivers/irqchip/irq-bcm2712-mip.c | 10 +++------ drivers/irqchip/irq-bcm7038-l1.c | 5 +++-- drivers/irqchip/irq-bcm7120-l2.c | 20 +++++------------ drivers/irqchip/irq-brcmstb-l2.c | 21 +++++++++--------- drivers/irqchip/irq-imx-mu-msi.c | 24 ++++++++++---------- drivers/irqchip/irq-mchp-eic.c | 5 +++-- drivers/irqchip/irq-meson-gpio.c | 5 +++-- drivers/irqchip/irq-qcom-mpm.c | 6 ++--- drivers/irqchip/irq-renesas-rzg2l.c | 35 +++++++++--------------------- drivers/irqchip/irq-renesas-rzv2h.c | 32 ++++++++------------------- drivers/irqchip/irq-starfive-jh8100-intc.c | 5 +++-- drivers/irqchip/irqchip.c | 6 ++--- drivers/irqchip/qcom-pdc.c | 5 +++-- include/linux/irqchip.h | 8 ++++++- 14 files changed, 78 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-bcm2712-mip.c b/drivers/irqchip/irq-bcm2712-mip.c index 8466646e5a2d..4761974ad650 100644 --- a/drivers/irqchip/irq-bcm2712-mip.c +++ b/drivers/irqchip/irq-bcm2712-mip.c @@ -232,16 +232,12 @@ err_put: return ret; } -static int mip_of_msi_init(struct device_node *node, struct device_node *parent) +static int mip_msi_probe(struct platform_device *pdev, struct device_node *parent) { - struct platform_device *pdev; + struct device_node *node = pdev->dev.of_node; struct mip_priv *mip; int ret; - pdev = of_find_device_by_node(node); - if (!pdev) - return -EPROBE_DEFER; - mip = kzalloc(sizeof(*mip), GFP_KERNEL); if (!mip) return -ENOMEM; @@ -284,7 +280,7 @@ err_priv: } IRQCHIP_PLATFORM_DRIVER_BEGIN(mip_msi) -IRQCHIP_MATCH("brcm,bcm2712-mip", mip_of_msi_init) +IRQCHIP_MATCH("brcm,bcm2712-mip", mip_msi_probe) IRQCHIP_PLATFORM_DRIVER_END(mip_msi) MODULE_DESCRIPTION("Broadcom BCM2712 MSI-X interrupt controller"); MODULE_AUTHOR("Phil Elwell "); diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index eda33bd5d080..821b288587ca 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -394,8 +394,9 @@ static const struct irq_domain_ops bcm7038_l1_domain_ops = { .map = bcm7038_l1_map, }; -static int bcm7038_l1_of_init(struct device_node *dn, struct device_node *parent) +static int bcm7038_l1_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *dn = pdev->dev.of_node; struct bcm7038_l1_chip *intc; int idx, ret; @@ -453,7 +454,7 @@ out_free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(bcm7038_l1) -IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_of_init) +IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_probe) IRQCHIP_PLATFORM_DRIVER_END(bcm7038_l1) MODULE_DESCRIPTION("Broadcom STB 7038-style L1/L2 interrupt controller"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index b6c85560c42e..518c9d4366a5 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -206,14 +206,14 @@ static int bcm7120_l2_intc_iomap_3380(struct device_node *dn, struct bcm7120_l2_ return 0; } -static int bcm7120_l2_intc_probe(struct device_node *dn, struct device_node *parent, +static int bcm7120_l2_intc_probe(struct platform_device *pdev, struct device_node *parent, int (*iomap_regs_fn)(struct device_node *, struct bcm7120_l2_intc_data 
*), const char *intc_name) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + struct device_node *dn = pdev->dev.of_node; struct bcm7120_l2_intc_data *data; - struct platform_device *pdev; struct irq_chip_generic *gc; struct irq_chip_type *ct; int ret = 0; @@ -224,14 +224,7 @@ static int bcm7120_l2_intc_probe(struct device_node *dn, struct device_node *par if (!data) return -ENOMEM; - pdev = of_find_device_by_node(dn); - if (!pdev) { - ret = -ENODEV; - goto out_free_data; - } - data->num_parent_irqs = platform_irq_count(pdev); - put_device(&pdev->dev); if (data->num_parent_irqs <= 0) { pr_err("invalid number of parent interrupts\n"); ret = -ENOMEM; @@ -331,20 +324,19 @@ out_unmap: if (data->map_base[idx]) iounmap(data->map_base[idx]); } -out_free_data: kfree(data); return ret; } -static int bcm7120_l2_intc_probe_7120(struct device_node *dn, struct device_node *parent) +static int bcm7120_l2_intc_probe_7120(struct platform_device *pdev, struct device_node *parent) { - return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_7120, + return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_7120, "BCM7120 L2"); } -static int bcm7120_l2_intc_probe_3380(struct device_node *dn, struct device_node *parent) +static int bcm7120_l2_intc_probe_3380(struct platform_device *pdev, struct device_node *parent) { - return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_3380, + return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_3380, "BCM3380 L2"); } diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 53e67c6c01f7..bb7078d6524f 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -138,11 +138,12 @@ static void brcmstb_l2_intc_resume(struct irq_data *d) irq_reg_writel(gc, ~b->saved_mask, ct->regs.enable); } -static int brcmstb_l2_intc_of_init(struct device_node *np, struct device_node *parent, - const struct brcmstb_intc_init_params *init_params) +static int brcmstb_l2_intc_probe(struct platform_device *pdev, struct device_node *parent, + const struct brcmstb_intc_init_params *init_params) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; unsigned int set = 0; + struct device_node *np = pdev->dev.of_node; struct brcmstb_l2_intc_data *data; struct irq_chip_type *ct; int ret; @@ -255,21 +256,21 @@ out_free: return ret; } -static int brcmstb_l2_edge_intc_of_init(struct device_node *np, struct device_node *parent) +static int brcmstb_l2_edge_intc_probe(struct platform_device *pdev, struct device_node *parent) { - return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init); + return brcmstb_l2_intc_probe(pdev, parent, &l2_edge_intc_init); } -static int brcmstb_l2_lvl_intc_of_init(struct device_node *np, struct device_node *parent) +static int brcmstb_l2_lvl_intc_probe(struct platform_device *pdev, struct device_node *parent) { - return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init); + return brcmstb_l2_intc_probe(pdev, parent, &l2_lvl_intc_init); } IRQCHIP_PLATFORM_DRIVER_BEGIN(brcmstb_l2) -IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_of_init) +IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_probe) +IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_probe) +IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_probe) 
+IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_probe) IRQCHIP_PLATFORM_DRIVER_END(brcmstb_l2) MODULE_DESCRIPTION("Broadcom STB generic L2 interrupt controller"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c index 41df168aa7da..c598f2f52fc6 100644 --- a/drivers/irqchip/irq-imx-mu-msi.c +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -296,10 +296,9 @@ static const struct imx_mu_dcfg imx_mu_cfg_imx8ulp = { }, }; -static int imx_mu_of_init(struct device_node *dn, struct device_node *parent, - const struct imx_mu_dcfg *cfg) +static int imx_mu_probe(struct platform_device *pdev, struct device_node *parent, + const struct imx_mu_dcfg *cfg) { - struct platform_device *pdev = of_find_device_by_node(dn); struct device_link *pd_link_a; struct device_link *pd_link_b; struct imx_mu_msi *msi_data; @@ -415,28 +414,27 @@ static const struct dev_pm_ops imx_mu_pm_ops = { imx_mu_runtime_resume, NULL) }; -static int imx_mu_imx7ulp_of_init(struct device_node *dn, struct device_node *parent) +static int imx_mu_imx7ulp_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx7ulp); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx7ulp); } -static int imx_mu_imx6sx_of_init(struct device_node *dn, struct device_node *parent) +static int imx_mu_imx6sx_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx6sx); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx6sx); } -static int imx_mu_imx8ulp_of_init(struct device_node *dn, struct device_node *parent) +static int imx_mu_imx8ulp_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx8ulp); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx8ulp); } IRQCHIP_PLATFORM_DRIVER_BEGIN(imx_mu_msi) -IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_of_init) -IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_of_init) -IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_of_init) +IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_probe) +IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_probe) +IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_probe) IRQCHIP_PLATFORM_DRIVER_END(imx_mu_msi, .pm = &imx_mu_pm_ops) - MODULE_AUTHOR("Frank Li "); MODULE_DESCRIPTION("Freescale MU MSI controller driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c index 516a3a0e359c..b513a899c085 100644 --- a/drivers/irqchip/irq-mchp-eic.c +++ b/drivers/irqchip/irq-mchp-eic.c @@ -199,8 +199,9 @@ static const struct irq_domain_ops mchp_eic_domain_ops = { .free = irq_domain_free_irqs_common, }; -static int mchp_eic_init(struct device_node *node, struct device_node *parent) +static int mchp_eic_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *node = pdev->dev.of_node; struct irq_domain *parent_domain = NULL; int ret, i; @@ -273,7 +274,7 @@ free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(mchp_eic) -IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_init) +IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_probe) IRQCHIP_PLATFORM_DRIVER_END(mchp_eic) MODULE_DESCRIPTION("Microchip External Interrupt Controller"); diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index 7d177626d64b..09ebf1d9c21b 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -572,8 +572,9 @@ static int meson_gpio_irq_parse_dt(struct 
device_node *node, struct meson_gpio_i return 0; } -static int meson_gpio_irq_of_init(struct device_node *node, struct device_node *parent) +static int meson_gpio_irq_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *node = pdev->dev.of_node; struct irq_domain *domain, *parent_domain; struct meson_gpio_irq_controller *ctl; int ret; @@ -630,7 +631,7 @@ free_ctl: } IRQCHIP_PLATFORM_DRIVER_BEGIN(meson_gpio_intc) -IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_of_init) +IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_probe) IRQCHIP_PLATFORM_DRIVER_END(meson_gpio_intc) MODULE_AUTHOR("Jerome Brunet "); diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c index 8d569f7c5a7a..83f31ea657b7 100644 --- a/drivers/irqchip/irq-qcom-mpm.c +++ b/drivers/irqchip/irq-qcom-mpm.c @@ -320,9 +320,9 @@ static bool gic_hwirq_is_mapped(struct mpm_gic_map *maps, int cnt, u32 hwirq) return false; } -static int qcom_mpm_init(struct device_node *np, struct device_node *parent) +static int qcom_mpm_probe(struct platform_device *pdev, struct device_node *parent) { - struct platform_device *pdev = of_find_device_by_node(np); + struct device_node *np = pdev->dev.of_node; struct device *dev = &pdev->dev; struct irq_domain *parent_domain; struct generic_pm_domain *genpd; @@ -478,7 +478,7 @@ remove_genpd: } IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_mpm) -IRQCHIP_MATCH("qcom,mpm", qcom_mpm_init) +IRQCHIP_MATCH("qcom,mpm", qcom_mpm_probe) IRQCHIP_PLATFORM_DRIVER_END(qcom_mpm) MODULE_DESCRIPTION("Qualcomm Technologies, Inc. MSM Power Manager"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 12b6eb150301..1bf19deb02c4 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include @@ -528,18 +527,15 @@ static int rzg2l_irqc_parse_interrupts(struct rzg2l_irqc_priv *priv, return 0; } -static int rzg2l_irqc_common_init(struct device_node *node, struct device_node *parent, - const struct irq_chip *irq_chip) +static int rzg2l_irqc_common_probe(struct platform_device *pdev, struct device_node *parent, + const struct irq_chip *irq_chip) { - struct platform_device *pdev = of_find_device_by_node(node); - struct device *dev __free(put_device) = pdev ? &pdev->dev : NULL; struct irq_domain *irq_domain, *parent_domain; + struct device_node *node = pdev->dev.of_node; + struct device *dev = &pdev->dev; struct reset_control *resetn; int ret; - if (!pdev) - return -ENODEV; - parent_domain = irq_find_host(parent); if (!parent_domain) return dev_err_probe(dev, -ENODEV, "cannot find parent domain\n"); @@ -583,33 +579,22 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * register_syscore_ops(&rzg2l_irqc_syscore_ops); - /* - * Prevent the cleanup function from invoking put_device by assigning - * NULL to dev. - * - * make coccicheck will complain about missing put_device calls, but - * those are false positives, as dev will be automatically "put" via - * __free_put_device on the failing path. - * On the successful path we don't actually want to "put" dev. 
- */ - dev = NULL; - return 0; } -static int rzg2l_irqc_init(struct device_node *node, struct device_node *parent) +static int rzg2l_irqc_probe(struct platform_device *pdev, struct device_node *parent) { - return rzg2l_irqc_common_init(node, parent, &rzg2l_irqc_chip); + return rzg2l_irqc_common_probe(pdev, parent, &rzg2l_irqc_chip); } -static int rzfive_irqc_init(struct device_node *node, struct device_node *parent) +static int rzfive_irqc_probe(struct platform_device *pdev, struct device_node *parent) { - return rzg2l_irqc_common_init(node, parent, &rzfive_irqc_chip); + return rzg2l_irqc_common_probe(pdev, parent, &rzfive_irqc_chip); } IRQCHIP_PLATFORM_DRIVER_BEGIN(rzg2l_irqc) -IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_init) -IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_init) +IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_probe) +IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_probe) IRQCHIP_PLATFORM_DRIVER_END(rzg2l_irqc) MODULE_AUTHOR("Lad Prabhakar "); MODULE_DESCRIPTION("Renesas RZ/G2L IRQC Driver"); diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index 9018d9c3911e..899a423b5da8 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -490,29 +490,15 @@ static int rzv2h_icu_parse_interrupts(struct rzv2h_icu_priv *priv, struct device return 0; } -static void rzv2h_icu_put_device(void *data) -{ - put_device(data); -} - -static int rzv2h_icu_init_common(struct device_node *node, struct device_node *parent, - const struct rzv2h_hw_info *hw_info) +static int rzv2h_icu_probe_common(struct platform_device *pdev, struct device_node *parent, + const struct rzv2h_hw_info *hw_info) { struct irq_domain *irq_domain, *parent_domain; + struct device_node *node = pdev->dev.of_node; struct rzv2h_icu_priv *rzv2h_icu_data; - struct platform_device *pdev; struct reset_control *resetn; int ret; - pdev = of_find_device_by_node(node); - if (!pdev) - return -ENODEV; - - ret = devm_add_action_or_reset(&pdev->dev, rzv2h_icu_put_device, - &pdev->dev); - if (ret < 0) - return ret; - parent_domain = irq_find_host(parent); if (!parent_domain) { dev_err(&pdev->dev, "cannot find parent domain\n"); @@ -618,19 +604,19 @@ static const struct rzv2h_hw_info rzv2h_hw_params = { .field_width = 8, }; -static int rzg3e_icu_init(struct device_node *node, struct device_node *parent) +static int rzg3e_icu_probe(struct platform_device *pdev, struct device_node *parent) { - return rzv2h_icu_init_common(node, parent, &rzg3e_hw_params); + return rzv2h_icu_probe_common(pdev, parent, &rzg3e_hw_params); } -static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +static int rzv2h_icu_probe(struct platform_device *pdev, struct device_node *parent) { - return rzv2h_icu_init_common(node, parent, &rzv2h_hw_params); + return rzv2h_icu_probe_common(pdev, parent, &rzv2h_hw_params); } IRQCHIP_PLATFORM_DRIVER_BEGIN(rzv2h_icu) -IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_init) -IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_init) +IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_probe) +IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_probe) IRQCHIP_PLATFORM_DRIVER_END(rzv2h_icu) MODULE_AUTHOR("Fabrizio Castro "); MODULE_DESCRIPTION("Renesas RZ/V2H(P) ICU Driver"); diff --git a/drivers/irqchip/irq-starfive-jh8100-intc.c b/drivers/irqchip/irq-starfive-jh8100-intc.c index 117f2c651ebd..705361b4ebe0 100644 --- a/drivers/irqchip/irq-starfive-jh8100-intc.c +++ b/drivers/irqchip/irq-starfive-jh8100-intc.c @@ -114,8 +114,9 @@ 
static void starfive_intc_irq_handler(struct irq_desc *desc) chained_irq_exit(chip, desc); } -static int starfive_intc_init(struct device_node *intc, struct device_node *parent) +static int starfive_intc_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *intc = pdev->dev.of_node; struct starfive_irq_chip *irqc; struct reset_control *rst; struct clk *clk; @@ -198,7 +199,7 @@ err_free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(starfive_intc) -IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_init) +IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_probe) IRQCHIP_PLATFORM_DRIVER_END(starfive_intc) MODULE_DESCRIPTION("StarFive JH8100 External Interrupt Controller"); diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c index 652d20d2b07f..689c8e448901 100644 --- a/drivers/irqchip/irqchip.c +++ b/drivers/irqchip/irqchip.c @@ -36,9 +36,9 @@ int platform_irqchip_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *par_np __free(device_node) = of_irq_find_parent(np); - of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev); + platform_irq_probe_t irq_probe = of_device_get_match_data(&pdev->dev); - if (!irq_init_cb) + if (!irq_probe) return -EINVAL; if (par_np == np) @@ -55,6 +55,6 @@ int platform_irqchip_probe(struct platform_device *pdev) if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) return -EPROBE_DEFER; - return irq_init_cb(np, par_np); + return irq_probe(pdev, par_np); } EXPORT_SYMBOL_GPL(platform_irqchip_probe); diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c index 52d77546aacb..518f7f0f3dab 100644 --- a/drivers/irqchip/qcom-pdc.c +++ b/drivers/irqchip/qcom-pdc.c @@ -350,9 +350,10 @@ static int pdc_setup_pin_mapping(struct device_node *np) #define QCOM_PDC_SIZE 0x30000 -static int qcom_pdc_init(struct device_node *node, struct device_node *parent) +static int qcom_pdc_probe(struct platform_device *pdev, struct device_node *parent) { struct irq_domain *parent_domain, *pdc_domain; + struct device_node *node = pdev->dev.of_node; resource_size_t res_size; struct resource res; int ret; @@ -428,7 +429,7 @@ fail: } IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_pdc) -IRQCHIP_MATCH("qcom,pdc", qcom_pdc_init) +IRQCHIP_MATCH("qcom,pdc", qcom_pdc_probe) IRQCHIP_PLATFORM_DRIVER_END(qcom_pdc) MODULE_DESCRIPTION("Qualcomm Technologies, Inc. Power Domain Controller"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index d5e6024cb2a8..bc4ddacd6ddc 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -17,12 +17,18 @@ #include #include +typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *); + /* Undefined on purpose */ extern of_irq_init_cb_t typecheck_irq_init_cb; +extern platform_irq_probe_t typecheck_irq_probe; #define typecheck_irq_init_cb(fn) \ (__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn) +#define typecheck_irq_probe(fn) \ + (__typecheck(typecheck_irq_probe, &fn) ? fn : fn) + /* * This macro must be used by the different irqchip drivers to declare * the association between their DT compatible string and their @@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev); static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ - .data = typecheck_irq_init_cb(fn), }, + .data = typecheck_irq_probe(fn), }, #define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) 
\ -- cgit v1.2.3 From dce745009349fc391271c9415d5e242781ddadd7 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 21 Jul 2025 08:36:26 +0200 Subject: PCI/MSI: Delete pci_msi_create_irq_domain() pci_msi_create_irq_domain() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas --- drivers/pci/msi/irqdomain.c | 90 --------------------------------------------- include/linux/msi.h | 3 -- 2 files changed, 93 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index ce741ed9dc3f..a329060287b5 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -49,96 +49,6 @@ static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg * __pci_write_msi_msg(desc, msg); } -/** - * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source - * @desc: Pointer to the MSI descriptor - * - * The ID number is only used within the irqdomain. - */ -static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) -{ - struct pci_dev *dev = msi_desc_to_pci_dev(desc); - - return (irq_hw_number_t)desc->msi_index | - pci_dev_id(dev) << 11 | - ((irq_hw_number_t)(pci_domain_nr(dev->bus) & 0xFFFFFFFF)) << 27; -} - -static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, - struct msi_desc *desc) -{ - arg->desc = desc; - arg->hwirq = pci_msi_domain_calc_hwirq(desc); -} - -static struct msi_domain_ops pci_msi_domain_ops_default = { - .set_desc = pci_msi_domain_set_desc, -}; - -static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info) -{ - struct msi_domain_ops *ops = info->ops; - - if (ops == NULL) { - info->ops = &pci_msi_domain_ops_default; - } else { - if (ops->set_desc == NULL) - ops->set_desc = pci_msi_domain_set_desc; - } -} - -static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info) -{ - struct irq_chip *chip = info->chip; - - BUG_ON(!chip); - if (!chip->irq_write_msi_msg) - chip->irq_write_msi_msg = pci_msi_domain_write_msg; - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; -} - -/** - * pci_msi_create_irq_domain - Create a MSI interrupt domain - * @fwnode: Optional fwnode of the interrupt controller - * @info: MSI domain info - * @parent: Parent irq domain - * - * Updates the domain and chip ops and creates a MSI interrupt domain. - * - * Returns: - * A domain pointer or NULL in case of failure. 
- */ -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent) -{ - if (WARN_ON(info->flags & MSI_FLAG_LEVEL_CAPABLE)) - info->flags &= ~MSI_FLAG_LEVEL_CAPABLE; - - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - pci_msi_domain_update_dom_ops(info); - if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) - pci_msi_domain_update_chip_ops(info); - - /* Let the core code free MSI descriptors when freeing interrupts */ - info->flags |= MSI_FLAG_FREE_MSI_DESCS; - - info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) - info->flags |= MSI_FLAG_MUST_REACTIVATE; - - /* PCI-MSI is oneshot-safe */ - info->chip->flags |= IRQCHIP_ONESHOT_SAFE; - /* Let the core update the bus token */ - info->bus_token = DOMAIN_BUS_PCI_MSI; - - return msi_create_irq_domain(fwnode, info, parent); -} -EXPORT_SYMBOL_GPL(pci_msi_create_irq_domain); - /* * Per device MSI[-X] domain functionality */ diff --git a/include/linux/msi.h b/include/linux/msi.h index d415dd15a0a9..8003e3218c46 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); -- cgit v1.2.3 From fe946a751d9b52b7c45ca34899723b314b79b249 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 17:19:04 +0000 Subject: net/sched: act_mirred: add loop detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 0f022d32c3ec ("net/sched: Fix mirred deadlock on device recursion") added code in the fast path, even when act_mirred is not used. Prepare its revert by implementing loop detection in act_mirred. Adds an array of device pointers in struct netdev_xmit. tcf_mirred_is_act_redirect() can detect if the array already contains the target device. 
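The essence of the check can be demonstrated with a small, self-contained user-space program (plain C, not kernel code; struct xmit_state and mirred_push_dev are invented for the demo): before each redirect the fixed array of at most MIRRED_NEST_LIMIT device pointers is scanned, and the redirect is refused either when the nesting limit is reached or when the target device is already recorded, i.e. a loop.

#include <stdbool.h>
#include <stdio.h>

#define MIRRED_NEST_LIMIT 4

struct xmit_state {
	unsigned int nest;
	const void *dev[MIRRED_NEST_LIMIT];
};

/* Returns false if redirecting to @dev would exceed the limit or loop. */
static bool mirred_push_dev(struct xmit_state *xmit, const void *dev)
{
	unsigned int i;

	if (xmit->nest >= MIRRED_NEST_LIMIT)
		return false;			/* recursion limit hit */

	for (i = 0; i < xmit->nest; i++)
		if (xmit->dev[i] == dev)
			return false;		/* device seen before: loop */

	xmit->dev[xmit->nest++] = dev;
	return true;
}

int main(void)
{
	struct xmit_state xmit = { 0 };
	const char *eth0 = "eth0", *eth1 = "eth1";

	printf("%d\n", mirred_push_dev(&xmit, eth0));	/* 1 */
	printf("%d\n", mirred_push_dev(&xmit, eth1));	/* 1 */
	printf("%d\n", mirred_push_dev(&xmit, eth0));	/* 0: loop detected */
	return 0;
}

In the actual patch the entry is popped again once the action returns (xmit->sched_mirred_nest--), which the demo leaves out for brevity.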
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Toke Høiland-Jørgensen Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251014171907.3554413-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice_xmit.h | 9 +++++- net/sched/act_mirred.c | 62 ++++++++++++++++-------------------------- 2 files changed, 31 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 813a19122ebb..cc232508e695 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -2,6 +2,12 @@ #ifndef _LINUX_NETDEVICE_XMIT_H #define _LINUX_NETDEVICE_XMIT_H +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) +#define MIRRED_NEST_LIMIT 4 +#endif + +struct net_device; + struct netdev_xmit { u16 recursion; u8 more; @@ -9,7 +15,8 @@ struct netdev_xmit { u8 skip_txqueue; #endif #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) - u8 sched_mirred_nest; + u8 sched_mirred_nest; + struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT]; #endif #if IS_ENABLED(CONFIG_NF_DUP_NETDEV) u8 nf_dup_skb_recursion; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5f01f567c934..f27b583def78 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,31 +29,6 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_NEST_LIMIT 4 - -#ifndef CONFIG_PREEMPT_RT -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); -} - -static void tcf_mirred_nest_level_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); -} - -#else -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return current->net_xmit.sched_mirred_nest++; -} - -static void tcf_mirred_nest_level_dec(void) -{ - current->net_xmit.sched_mirred_nest--; -} -#endif - static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; @@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); - unsigned int nest_level; + struct netdev_xmit *xmit; bool m_mac_header_xmit; struct net_device *dev; - int m_eaction; + int i, m_eaction; u32 blockid; - nest_level = tcf_mirred_nest_level_inc_return(); - if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { +#ifdef CONFIG_PREEMPT_RT + xmit = ¤t->net_xmit; +#else + xmit = this_cpu_ptr(&softnet_data.xmit); +#endif + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - retval = TC_ACT_SHOT; - goto dec_nest_level; + return TC_ACT_SHOT; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); - if (blockid) { - retval = tcf_blockcast(skb, m, blockid, res, retval); - goto dec_nest_level; - } + if (blockid) + return tcf_blockcast(skb, m, blockid, res, retval); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - goto dec_nest_level; + return retval; } + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + return retval; + } + + 
xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - -dec_nest_level: - tcf_mirred_nest_level_dec(); + xmit->sched_mirred_nest--; return retval; } -- cgit v1.2.3 From f968a24cad3da72fdff12a0ae5ac0b679439cca1 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Fri, 3 Oct 2025 12:16:38 +0900 Subject: can: treewide: remove can_change_mtu() can_change_mtu() became obsolete by commit 23049938605b ("can: populate the minimum and maximum MTU values"). Now that net_device->min_mtu and net_device->max_mtu are populated, all the checks are already done by dev_validate_mtu() in net/core/dev.c. Remove the net_device_ops->ndo_change_mtu() callback of all the physical interfaces, then remove can_change_mtu(). Only keep the vcan_change_mtu() and vxcan_change_mtu() because the virtual interfaces use their own different MTU logic. The only functional change this patch introduces is that now the user will be able to change the MTU even if the interface is up. This does not matter for Classical CAN and CAN FD because their MTU range is composed of only one value, respectively CAN_MTU and CANFD_MTU. For the upcoming CAN XL, the MTU will be configurable within the CANXL_MIN_MTU to CANXL_MAX_MTU range at any time, even if the interface is up. This is consistent with the other net protocols and does not contradict ISO 11898-1:2024 as having a modifiable MTU is a kernel extension. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20251003-remove-can_change_mtu-v1-1-337f8bc21181@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/at91_can.c | 1 - drivers/net/can/bxcan.c | 1 - drivers/net/can/c_can/c_can_main.c | 1 - drivers/net/can/can327.c | 1 - drivers/net/can/cc770/cc770.c | 1 - drivers/net/can/ctucanfd/ctucanfd_base.c | 1 - drivers/net/can/dev/dev.c | 38 ---------------------- drivers/net/can/esd/esd_402_pci-core.c | 1 - drivers/net/can/flexcan/flexcan-core.c | 1 - drivers/net/can/grcan.c | 1 - drivers/net/can/ifi_canfd/ifi_canfd.c | 1 - drivers/net/can/janz-ican3.c | 1 - drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c | 1 - drivers/net/can/m_can/m_can.c | 1 - drivers/net/can/mscan/mscan.c | 1 - drivers/net/can/peak_canfd/peak_canfd.c | 1 - drivers/net/can/rcar/rcar_can.c | 1 - drivers/net/can/rcar/rcar_canfd.c | 1 - drivers/net/can/rockchip/rockchip_canfd-core.c | 1 - drivers/net/can/sja1000/sja1000.c | 1 - drivers/net/can/slcan/slcan-core.c | 1 - drivers/net/can/softing/softing_main.c | 1 - drivers/net/can/spi/hi311x.c | 1 - drivers/net/can/spi/mcp251x.c | 1 - drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 1 - drivers/net/can/sun4i_can.c | 1 - drivers/net/can/ti_hecc.c | 1 - drivers/net/can/usb/ems_usb.c | 1 - drivers/net/can/usb/esd_usb.c | 1 - drivers/net/can/usb/etas_es58x/es58x_core.c | 1 - drivers/net/can/usb/f81604.c | 1 - drivers/net/can/usb/gs_usb.c | 1 - drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 1 - drivers/net/can/usb/mcba_usb.c | 1 - drivers/net/can/usb/nct6694_canfd.c | 1 - drivers/net/can/usb/peak_usb/pcan_usb_core.c | 1 - drivers/net/can/usb/ucan.c | 1 - drivers/net/can/usb/usb_8dev.c | 1 - drivers/net/can/xilinx_can.c | 1 - include/linux/can/dev.h | 1 - 40 files changed, 77 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 191707d7e3da..c2a3a4eef5b2 100644 --- a/drivers/net/can/at91_can.c +++ 
b/drivers/net/can/at91_can.c @@ -948,7 +948,6 @@ static const struct net_device_ops at91_netdev_ops = { .ndo_open = at91_open, .ndo_stop = at91_close, .ndo_start_xmit = at91_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops at91_ethtool_ops = { diff --git a/drivers/net/can/bxcan.c b/drivers/net/can/bxcan.c index bfc60eb33dc3..9c3af7049814 100644 --- a/drivers/net/can/bxcan.c +++ b/drivers/net/can/bxcan.c @@ -881,7 +881,6 @@ static const struct net_device_ops bxcan_netdev_ops = { .ndo_open = bxcan_open, .ndo_stop = bxcan_stop, .ndo_start_xmit = bxcan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops bxcan_ethtool_ops = { diff --git a/drivers/net/can/c_can/c_can_main.c b/drivers/net/can/c_can/c_can_main.c index cc371d0c9f3c..3702cac7fbf0 100644 --- a/drivers/net/can/c_can/c_can_main.c +++ b/drivers/net/can/c_can/c_can_main.c @@ -1362,7 +1362,6 @@ static const struct net_device_ops c_can_netdev_ops = { .ndo_open = c_can_open, .ndo_stop = c_can_close, .ndo_start_xmit = c_can_start_xmit, - .ndo_change_mtu = can_change_mtu, }; int register_c_can_dev(struct net_device *dev) diff --git a/drivers/net/can/can327.c b/drivers/net/can/can327.c index 24af63961030..b66fc16aedd2 100644 --- a/drivers/net/can/can327.c +++ b/drivers/net/can/can327.c @@ -849,7 +849,6 @@ static const struct net_device_ops can327_netdev_ops = { .ndo_open = can327_netdev_open, .ndo_stop = can327_netdev_close, .ndo_start_xmit = can327_netdev_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops can327_ethtool_ops = { diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index 30909f3aab57..8d5abd643c06 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -834,7 +834,6 @@ static const struct net_device_ops cc770_netdev_ops = { .ndo_open = cc770_open, .ndo_stop = cc770_close, .ndo_start_xmit = cc770_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops cc770_ethtool_ops = { diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index 8bd3f0fc385c..1e6b9e3dc2fe 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -1301,7 +1301,6 @@ static const struct net_device_ops ctucan_netdev_ops = { .ndo_open = ctucan_open, .ndo_stop = ctucan_close, .ndo_start_xmit = ctucan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ctucan_ethtool_ops = { diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 15ccedbb3f8d..0cc3d008adb3 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -359,44 +359,6 @@ void can_set_default_mtu(struct net_device *dev) } } -/* changing MTU and control mode for CAN/CANFD devices */ -int can_change_mtu(struct net_device *dev, int new_mtu) -{ - struct can_priv *priv = netdev_priv(dev); - u32 ctrlmode_static = can_get_static_ctrlmode(priv); - - /* Do not allow changing the MTU while running */ - if (dev->flags & IFF_UP) - return -EBUSY; - - /* allow change of MTU according to the CANFD ability of the device */ - switch (new_mtu) { - case CAN_MTU: - /* 'CANFD-only' controllers can not switch to CAN_MTU */ - if (ctrlmode_static & CAN_CTRLMODE_FD) - return -EINVAL; - - priv->ctrlmode &= ~CAN_CTRLMODE_FD; - break; - - case CANFD_MTU: - /* check for potential CANFD ability */ - if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD) && - !(ctrlmode_static & CAN_CTRLMODE_FD)) - return -EINVAL; - - 
priv->ctrlmode |= CAN_CTRLMODE_FD; - break; - - default: - return -EINVAL; - } - - WRITE_ONCE(dev->mtu, new_mtu); - return 0; -} -EXPORT_SYMBOL_GPL(can_change_mtu); - /* helper to define static CAN controller features at device creation time */ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) { diff --git a/drivers/net/can/esd/esd_402_pci-core.c b/drivers/net/can/esd/esd_402_pci-core.c index 5d6d2828cd04..05adecae6375 100644 --- a/drivers/net/can/esd/esd_402_pci-core.c +++ b/drivers/net/can/esd/esd_402_pci-core.c @@ -86,7 +86,6 @@ static const struct net_device_ops pci402_acc_netdev_ops = { .ndo_open = acc_open, .ndo_stop = acc_close, .ndo_start_xmit = acc_start_xmit, - .ndo_change_mtu = can_change_mtu, .ndo_eth_ioctl = can_eth_ioctl_hwts, }; diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index 06d5d35fc1b5..f5d22c61503f 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -1867,7 +1867,6 @@ static const struct net_device_ops flexcan_netdev_ops = { .ndo_open = flexcan_open, .ndo_stop = flexcan_close, .ndo_start_xmit = flexcan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static int register_flexcandev(struct net_device *dev) diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index c5784d9779ef..3b1b09943436 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1561,7 +1561,6 @@ static const struct net_device_ops grcan_netdev_ops = { .ndo_open = grcan_open, .ndo_stop = grcan_close, .ndo_start_xmit = grcan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops grcan_ethtool_ops = { diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 2eeee65f606f..0f83335e4d07 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -944,7 +944,6 @@ static const struct net_device_ops ifi_canfd_netdev_ops = { .ndo_open = ifi_canfd_open, .ndo_stop = ifi_canfd_close, .ndo_start_xmit = ifi_canfd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ifi_canfd_ethtool_ops = { diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index bfa5cbe88017..1efdd1fd8caa 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -1752,7 +1752,6 @@ static const struct net_device_ops ican3_netdev_ops = { .ndo_open = ican3_open, .ndo_stop = ican3_stop, .ndo_start_xmit = ican3_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ican3_ethtool_ops = { diff --git a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c index 0880023611be..705f9bb74cd2 100644 --- a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c +++ b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c @@ -904,7 +904,6 @@ static const struct net_device_ops kvaser_pciefd_netdev_ops = { .ndo_stop = kvaser_pciefd_stop, .ndo_eth_ioctl = can_eth_ioctl_hwts, .ndo_start_xmit = kvaser_pciefd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static int kvaser_pciefd_set_phys_id(struct net_device *netdev, diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 48b7a67336b5..873f5991fc5a 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2148,7 +2148,6 @@ static const struct net_device_ops m_can_netdev_ops = { .ndo_open = m_can_open, .ndo_stop = m_can_close, .ndo_start_xmit = m_can_start_xmit, - .ndo_change_mtu = 
can_change_mtu, }; static int m_can_get_coalesce(struct net_device *dev, diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 8c2a7bc64d3d..39c7aa2a0b2f 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -607,7 +607,6 @@ static const struct net_device_ops mscan_netdev_ops = { .ndo_open = mscan_open, .ndo_stop = mscan_close, .ndo_start_xmit = mscan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops mscan_ethtool_ops = { diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index b5bc80ac7876..a53c9d347b7b 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -773,7 +773,6 @@ static const struct net_device_ops peak_canfd_netdev_ops = { .ndo_stop = peak_canfd_close, .ndo_eth_ioctl = peak_eth_ioctl, .ndo_start_xmit = peak_canfd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static int peak_get_ts_info(struct net_device *dev, diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 5f85f4e27205..fc3df328e877 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -635,7 +635,6 @@ static const struct net_device_ops rcar_can_netdev_ops = { .ndo_open = rcar_can_open, .ndo_stop = rcar_can_close, .ndo_start_xmit = rcar_can_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops rcar_can_ethtool_ops = { diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 45d36adb51b7..49ab65274b51 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1818,7 +1818,6 @@ static const struct net_device_ops rcar_canfd_netdev_ops = { .ndo_open = rcar_canfd_open, .ndo_stop = rcar_canfd_close, .ndo_start_xmit = rcar_canfd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops rcar_canfd_ethtool_ops = { diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c index 046f0a0ae4d4..29de0c01e4ed 100644 --- a/drivers/net/can/rockchip/rockchip_canfd-core.c +++ b/drivers/net/can/rockchip/rockchip_canfd-core.c @@ -761,7 +761,6 @@ static const struct net_device_ops rkcanfd_netdev_ops = { .ndo_open = rkcanfd_open, .ndo_stop = rkcanfd_stop, .ndo_start_xmit = rkcanfd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static int __maybe_unused rkcanfd_runtime_suspend(struct device *dev) diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 4d245857ef1c..acfa49db3907 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -697,7 +697,6 @@ static const struct net_device_ops sja1000_netdev_ops = { .ndo_open = sja1000_open, .ndo_stop = sja1000_close, .ndo_start_xmit = sja1000_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops sja1000_ethtool_ops = { diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c index 58ff2ec1d975..cd789e178d34 100644 --- a/drivers/net/can/slcan/slcan-core.c +++ b/drivers/net/can/slcan/slcan-core.c @@ -774,7 +774,6 @@ static const struct net_device_ops slcan_netdev_ops = { .ndo_open = slcan_netdev_open, .ndo_stop = slcan_netdev_close, .ndo_start_xmit = slcan_netdev_xmit, - .ndo_change_mtu = can_change_mtu, }; /****************************************** diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 
278ee8722770..79bc64395ac4 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -609,7 +609,6 @@ static const struct net_device_ops softing_netdev_ops = { .ndo_open = softing_netdev_open, .ndo_stop = softing_netdev_stop, .ndo_start_xmit = softing_netdev_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops softing_ethtool_ops = { diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 6d4b643e135f..e00d3dbc4cf4 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -799,7 +799,6 @@ static const struct net_device_ops hi3110_netdev_ops = { .ndo_open = hi3110_open, .ndo_stop = hi3110_stop, .ndo_start_xmit = hi3110_hard_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops hi3110_ethtool_ops = { diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index b797e08499d7..1e54e1a22702 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -1270,7 +1270,6 @@ static const struct net_device_ops mcp251x_netdev_ops = { .ndo_open = mcp251x_open, .ndo_stop = mcp251x_stop, .ndo_start_xmit = mcp251x_hard_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops mcp251x_ethtool_ops = { diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 7450ea42c1ea..9402530ba3d4 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1715,7 +1715,6 @@ static const struct net_device_ops mcp251xfd_netdev_ops = { .ndo_stop = mcp251xfd_stop, .ndo_start_xmit = mcp251xfd_start_xmit, .ndo_eth_ioctl = can_eth_ioctl_hwts, - .ndo_change_mtu = can_change_mtu, }; static void diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 53bfd873de9b..6fcb301ef611 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -768,7 +768,6 @@ static const struct net_device_ops sun4ican_netdev_ops = { .ndo_open = sun4ican_open, .ndo_stop = sun4ican_close, .ndo_start_xmit = sun4ican_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops sun4ican_ethtool_ops = { diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index e6d6661a908a..1d3dbf28b105 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -829,7 +829,6 @@ static const struct net_device_ops ti_hecc_netdev_ops = { .ndo_open = ti_hecc_open, .ndo_stop = ti_hecc_close, .ndo_start_xmit = ti_hecc_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ti_hecc_ethtool_ops = { diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 5355bac4dccb..de8e212a1366 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -885,7 +885,6 @@ static const struct net_device_ops ems_usb_netdev_ops = { .ndo_open = ems_usb_open, .ndo_stop = ems_usb_close, .ndo_start_xmit = ems_usb_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ems_usb_ethtool_ops = { diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c index 9bc1824d7be6..08da507faef4 100644 --- a/drivers/net/can/usb/esd_usb.c +++ b/drivers/net/can/usb/esd_usb.c @@ -1011,7 +1011,6 @@ static const struct net_device_ops esd_usb_netdev_ops = { .ndo_open = esd_usb_open, .ndo_stop = esd_usb_close, .ndo_start_xmit = esd_usb_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct 
ethtool_ops esd_usb_ethtool_ops = { diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index adc91873c083..47d9e03f3044 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -1977,7 +1977,6 @@ static const struct net_device_ops es58x_netdev_ops = { .ndo_stop = es58x_stop, .ndo_start_xmit = es58x_start_xmit, .ndo_eth_ioctl = can_eth_ioctl_hwts, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops es58x_ethtool_ops = { diff --git a/drivers/net/can/usb/f81604.c b/drivers/net/can/usb/f81604.c index e0cfa1460b0b..efe61ece79ea 100644 --- a/drivers/net/can/usb/f81604.c +++ b/drivers/net/can/usb/f81604.c @@ -1052,7 +1052,6 @@ static const struct net_device_ops f81604_netdev_ops = { .ndo_open = f81604_open, .ndo_stop = f81604_close, .ndo_start_xmit = f81604_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct can_bittiming_const f81604_bittiming_const = { diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 69b8d6da651b..30608901a974 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -1101,7 +1101,6 @@ static const struct net_device_ops gs_usb_netdev_ops = { .ndo_open = gs_can_open, .ndo_stop = gs_can_close, .ndo_start_xmit = gs_can_start_xmit, - .ndo_change_mtu = can_change_mtu, .ndo_eth_ioctl = gs_can_eth_ioctl, }; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 90e77fa0ff4a..89e22b66f919 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -786,7 +786,6 @@ static const struct net_device_ops kvaser_usb_netdev_ops = { .ndo_stop = kvaser_usb_close, .ndo_eth_ioctl = can_eth_ioctl_hwts, .ndo_start_xmit = kvaser_usb_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops kvaser_usb_ethtool_ops = { diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 1f9b915094e6..41c0a1c399bf 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -761,7 +761,6 @@ static const struct net_device_ops mcba_netdev_ops = { .ndo_open = mcba_usb_open, .ndo_stop = mcba_usb_close, .ndo_start_xmit = mcba_usb_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops mcba_ethtool_ops = { diff --git a/drivers/net/can/usb/nct6694_canfd.c b/drivers/net/can/usb/nct6694_canfd.c index 8deff16491a1..dd6df2ec3742 100644 --- a/drivers/net/can/usb/nct6694_canfd.c +++ b/drivers/net/can/usb/nct6694_canfd.c @@ -690,7 +690,6 @@ static const struct net_device_ops nct6694_canfd_netdev_ops = { .ndo_open = nct6694_canfd_open, .ndo_stop = nct6694_canfd_close, .ndo_start_xmit = nct6694_canfd_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops nct6694_canfd_ethtool_ops = { diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index c74302ca7cee..94b1d7f15d27 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -814,7 +814,6 @@ static const struct net_device_ops peak_usb_netdev_ops = { .ndo_stop = peak_usb_ndo_stop, .ndo_eth_ioctl = peak_eth_ioctl, .ndo_start_xmit = peak_usb_ndo_start_xmit, - .ndo_change_mtu = can_change_mtu, }; /* CAN-USB devices generally handle 32-bit CAN channel IDs. 
diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 07406daf7c88..de61d9da99e3 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -1233,7 +1233,6 @@ static const struct net_device_ops ucan_netdev_ops = { .ndo_open = ucan_open, .ndo_stop = ucan_close, .ndo_start_xmit = ucan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops ucan_ethtool_ops = { diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 8a5596ce4e46..7449328f7cd7 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -868,7 +868,6 @@ static const struct net_device_ops usb_8dev_netdev_ops = { .ndo_open = usb_8dev_open, .ndo_stop = usb_8dev_close, .ndo_start_xmit = usb_8dev_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops usb_8dev_ethtool_ops = { diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index a25a3ca62c12..43d7f22820b8 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -1702,7 +1702,6 @@ static const struct net_device_ops xcan_netdev_ops = { .ndo_open = xcan_open, .ndo_stop = xcan_close, .ndo_start_xmit = xcan_start_xmit, - .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops xcan_ethtool_ops = { diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index a2229a61ccde..0fe8f80f223e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -127,7 +127,6 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); -int can_change_mtu(struct net_device *dev, int new_mtu); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); -- cgit v1.2.3 From 0fbbcab7f9082cdc233da5e5e353f69830f11956 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:07:42 -0700 Subject: cgroup/misc: fix misc_res_type kernel-doc warning Format the kernel-doc for MISC_CG_RES_TDX correctly to avoid a kernel-doc warning: Warning: include/linux/misc_cgroup.h:26 Enum value 'MISC_CG_RES_TDX' not described in enum 'misc_res_type' Fixes: 7c035bea9407 ("KVM: TDX: Register TDX host key IDs to cgroup misc controller") Signed-off-by: Randy Dunlap Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 71cf5bfc6349..0cb36a3ffc47 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -19,7 +19,7 @@ enum misc_res_type { MISC_CG_RES_SEV_ES, #endif #ifdef CONFIG_INTEL_TDX_HOST - /* Intel TDX HKIDs resource */ + /** @MISC_CG_RES_TDX: Intel TDX HKIDs resource */ MISC_CG_RES_TDX, #endif /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ -- cgit v1.2.3 From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Fri, 10 Oct 2025 13:52:54 +0800 Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking During suspend/resume tests with S2IDLE, some ISH functional failures were observed because of a delay in executing the ISH resume handler. Here, schedule_work() is used from the resume handler to do the actual work. schedule_work() uses system_wq, which is a per-CPU work queue.
Although the queuing is not bound to a CPU, but it prefers local CPU of the caller, unless prohibited. Users of this work queue are not supposed to queue long running work. But in practice, there are scenarios where long running work items are queued on other unbound workqueues, occupying the CPU. As a result, the ISH resume handler may not get a chance to execute in a timely manner. In one scenario, one of the ish_resume_handler() executions was delayed nearly 1 second because another work item on an unbound workqueue occupied the same CPU. This delay causes ISH functionality failures. A similar issue was previously observed where the ISH HID driver timed out while getting the HID descriptor during S4 resume in the recovery kernel, likely caused by the same workqueue contention problem. Create dedicated unbound workqueues for all ISH operations to allow work items to execute on any available CPU, eliminating CPU-specific bottlenecks and improving resume reliability under varying system loads. Also ISH has three different components, a bus driver which implements ISH protocols, a PCI interface layer and HID interface. Use one dedicated work queue for all of them. Signed-off-by: Zhang Lixu Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/ipc.c | 21 ++++++++++++++++++++- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 2 +- drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++-- drivers/hid/intel-ish-hid/ishtp/bus.c | 18 +++++++++++++++++- drivers/hid/intel-ish-hid/ishtp/hbm.c | 4 ++-- drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 3 +++ include/linux/intel-ish-client-if.h | 2 ++ 7 files changed, 47 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 3ddaa2cd39d5..9958f2968c4f 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -628,7 +628,7 @@ static void recv_ipc(struct ishtp_device *dev, uint32_t doorbell_val) if (!ishtp_dev) { ishtp_dev = dev; } - schedule_work(&fw_reset_work); + queue_work(dev->unbound_wq, &fw_reset_work); break; case MNG_RESET_NOTIFY_ACK: @@ -933,6 +933,21 @@ static const struct ishtp_hw_ops ish_hw_ops = { .dma_no_cache_snooping = _dma_no_cache_snooping }; +static struct workqueue_struct *devm_ishtp_alloc_workqueue(struct device *dev) +{ + struct workqueue_struct *wq; + + wq = alloc_workqueue("ishtp_unbound_%d", WQ_UNBOUND, 0, dev->id); + if (!wq) + return NULL; + + if (devm_add_action_or_reset(dev, (void (*)(void *))destroy_workqueue, + wq)) + return NULL; + + return wq; +} + /** * ish_dev_init() -Initialize ISH devoce * @pdev: PCI device @@ -953,6 +968,10 @@ struct ishtp_device *ish_dev_init(struct pci_dev *pdev) if (!dev) return NULL; + dev->unbound_wq = devm_ishtp_alloc_workqueue(&pdev->dev); + if (!dev->unbound_wq) + return NULL; + dev->devc = &pdev->dev; ishtp_device_init(dev); diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 9d150ce234f2..b748ac6fbfdc 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -384,7 +384,7 @@ static int __maybe_unused ish_resume(struct device *device) ish_resume_device = device; dev->resume_flag = 1; - schedule_work(&resume_work); + queue_work(dev->unbound_wq, &resume_work); return 0; } diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c index d8c3c54a8c0f..f61add862b6b 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c +++ 
b/drivers/hid/intel-ish-hid/ishtp-hid-client.c @@ -860,7 +860,7 @@ static int hid_ishtp_cl_reset(struct ishtp_cl_device *cl_device) hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__, hid_ishtp_cl); - schedule_work(&client_data->work); + queue_work(ishtp_get_workqueue(cl_device), &client_data->work); return 0; } @@ -902,7 +902,7 @@ static int hid_ishtp_cl_resume(struct device *device) hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__, hid_ishtp_cl); - schedule_work(&client_data->resume_work); + queue_work(ishtp_get_workqueue(cl_device), &client_data->resume_work); return 0; } diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c index 93a0432e7058..c6ce37244e49 100644 --- a/drivers/hid/intel-ish-hid/ishtp/bus.c +++ b/drivers/hid/intel-ish-hid/ishtp/bus.c @@ -541,7 +541,7 @@ void ishtp_cl_bus_rx_event(struct ishtp_cl_device *device) return; if (device->event_cb) - schedule_work(&device->event_work); + queue_work(device->ishtp_dev->unbound_wq, &device->event_work); } /** @@ -876,6 +876,22 @@ struct device *ishtp_get_pci_device(struct ishtp_cl_device *device) } EXPORT_SYMBOL(ishtp_get_pci_device); +/** + * ishtp_get_workqueue - Retrieve the workqueue associated with an ISHTP device + * @cl_device: Pointer to the ISHTP client device structure + * + * Returns the workqueue_struct pointer (unbound_wq) associated with the given + * ISHTP client device. This workqueue is typically used for scheduling work + * related to the device. + * + * Return: Pointer to struct workqueue_struct. + */ +struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device) +{ + return cl_device->ishtp_dev->unbound_wq; +} +EXPORT_SYMBOL(ishtp_get_workqueue); + /** * ishtp_trace_callback() - Return trace callback * @cl_device: ISH-TP client device instance diff --git a/drivers/hid/intel-ish-hid/ishtp/hbm.c b/drivers/hid/intel-ish-hid/ishtp/hbm.c index 8ee5467127d8..97c4fcd9e3c6 100644 --- a/drivers/hid/intel-ish-hid/ishtp/hbm.c +++ b/drivers/hid/intel-ish-hid/ishtp/hbm.c @@ -573,7 +573,7 @@ void ishtp_hbm_dispatch(struct ishtp_device *dev, /* Start firmware loading process if it has loader capability */ if (version_res->host_version_supported & ISHTP_SUPPORT_CAP_LOADER) - schedule_work(&dev->work_fw_loader); + queue_work(dev->unbound_wq, &dev->work_fw_loader); dev->version.major_version = HBM_MAJOR_VERSION; dev->version.minor_version = HBM_MINOR_VERSION; @@ -864,7 +864,7 @@ void recv_hbm(struct ishtp_device *dev, struct ishtp_msg_hdr *ishtp_hdr) dev->rd_msg_fifo_tail = (dev->rd_msg_fifo_tail + IPC_PAYLOAD_SIZE) % (RD_INT_FIFO_SIZE * IPC_PAYLOAD_SIZE); spin_unlock_irqrestore(&dev->rd_msg_spinlock, flags); - schedule_work(&dev->bh_hbm_work); + queue_work(dev->unbound_wq, &dev->bh_hbm_work); eoi: return; } diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h index 23db97ecf21c..4b0596eadf1c 100644 --- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h +++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h @@ -175,6 +175,9 @@ struct ishtp_device { struct hbm_version version; int transfer_path; /* Choice of transfer path: IPC or DMA */ + /* Alloc a dedicated unbound workqueue for ishtp device */ + struct workqueue_struct *unbound_wq; + /* work structure for scheduling firmware loading tasks */ struct work_struct work_fw_loader; /* waitq for waiting for command response from the firmware loader */ diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 
dfbf7d9d7bb5..b235fd84f478 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_device *dev); ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); /* Get device pointer of PCI device for DMA acces */ struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device); +/* Get the ISHTP workqueue */ +struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device); struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device); void ishtp_cl_free(struct ishtp_cl *cl); -- cgit v1.2.3 From 011aa2aa2c4c2b3356c32f195f306df6e177ac38 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Fri, 17 Oct 2025 10:22:13 +0800 Subject: HID: intel-ish-hid: Add ishtp_get_connection_state() interface Add the ishtp_get_connection_state() function for struct ishtp_cl, allowing ishtp client drivers to retrieve the current connection state. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ishtp/client.c | 6 ++++++ include/linux/intel-ish-client-if.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c index 21a2c0773cc2..40f510b1c072 100644 --- a/drivers/hid/intel-ish-hid/ishtp/client.c +++ b/drivers/hid/intel-ish-hid/ishtp/client.c @@ -1261,6 +1261,12 @@ void ishtp_set_connection_state(struct ishtp_cl *cl, int state) } EXPORT_SYMBOL(ishtp_set_connection_state); +int ishtp_get_connection_state(struct ishtp_cl *cl) +{ + return cl->state; +} +EXPORT_SYMBOL(ishtp_get_connection_state); + void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id) { cl->fw_client_id = fw_client_id; diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index b235fd84f478..2cd4f65aaa37 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -109,6 +109,7 @@ struct ishtp_device *ishtp_get_ishtp_device(struct ishtp_cl *cl); void ishtp_set_tx_ring_size(struct ishtp_cl *cl, int size); void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size); void ishtp_set_connection_state(struct ishtp_cl *cl, int state); +int ishtp_get_connection_state(struct ishtp_cl *cl); void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id); void ishtp_put_device(struct ishtp_cl_device *cl_dev); -- cgit v1.2.3 From 1c17f4373d4db1e1f0ebd3ddcd8e7a642927a826 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 22:42:07 +0000 Subject: ipv6: Move ipv6_fl_list from ipv6_pinfo to inet_sock. In {tcp6,udp6,raw6}_sock, struct ipv6_pinfo is always placed at the beginning of a new cache line because 1. __alignof__(struct tcp_sock) is 64 due to ____cacheline_aligned of __cacheline_group_begin(tcp_sock_write_tx) 2. __alignof__(struct udp_sock) is 64 due to ____cacheline_aligned of struct numa_drop_counters 3. in raw6_sock, struct numa_drop_counters is placed before struct ipv6_pinfo . struct ipv6_pinfo is 136 bytes, but the last cache line is only used by ipv6_fl_list: $ pahole -C ipv6_pinfo vmlinux struct ipv6_pinfo { ... /* --- cacheline 2 boundary (128 bytes) --- */ struct ipv6_fl_socklist * ipv6_fl_list; /* 128 8 */ /* size: 136, cachelines: 3, members: 23 */ Let's move ipv6_fl_list from struct ipv6_pinfo to struct inet_sock to save a full cache line for {tcp6,udp6,raw6}_sock. 
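For illustration, here is a minimal sketch of what a per-socket flow label lookup looks like once the list hangs off inet_sock; it simply mirrors __fl6_sock_lookup() in the diff below, with the label masking and refcount handling left out, and the helper name sk_flowlabel_lookup_sketch() is hypothetical.

/* Caller holds rcu_read_lock(); simplified from __fl6_sock_lookup() below. */
static struct ip6_flowlabel *sk_flowlabel_lookup_sketch(struct sock *sk,
							__be32 label)
{
	struct ipv6_fl_socklist *sfl;

	/* The list is now reached via inet_sk(sk) instead of inet6_sk(sk). */
	for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); sfl;
	     sfl = rcu_dereference(sfl->next))
		if (sfl->fl->label == label)
			return sfl->fl;

	return NULL;
}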
Now, struct ipv6_pinfo is 128 bytes, and {tcp6,udp6,raw6}_sock have 64 bytes less, while {tcp,udp,raw}_sock retain the same size. Before: # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}' RAWv6 1408 UDPv6 1472 TCPv6 2560 RAW 1152 UDP 1280 TCP 2368 After: # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}' RAWv6 1344 UDPv6 1408 TCPv6 2496 RAW 1152 UDP 1280 TCP 2368 Also, ipv6_fl_list and inet_flags (SNDFLOW bit) are placed in the same cache line. $ pahole -C inet_sock vmlinux ... /* --- cacheline 11 boundary (704 bytes) was 56 bytes ago --- */ struct ipv6_pinfo * pinet6; /* 760 8 */ /* --- cacheline 12 boundary (768 bytes) --- */ struct ipv6_fl_socklist * ipv6_fl_list; /* 768 8 */ unsigned long inet_flags; /* 776 8 */ Doc churn is due to the insufficient Type column (only 1 space short). Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251014224210.2964778-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/inet_sock.rst | 79 +++++++++++----------- .../chelsio/inline_crypto/chtls/chtls_cm.c | 4 +- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + net/ipv6/ip6_flowlabel.c | 44 ++++++------ net/ipv6/tcp_ipv6.c | 13 ++-- net/sctp/ipv6.c | 8 ++- 7 files changed, 76 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/inet_sock.rst b/Documentation/networking/net_cachelines/inet_sock.rst index b11bf48fa2b3..4c72a28a7012 100644 --- a/Documentation/networking/net_cachelines/inet_sock.rst +++ b/Documentation/networking/net_cachelines/inet_sock.rst @@ -5,42 +5,43 @@ inet_sock struct fast path usage breakdown ========================================== -======================= ===================== =================== =================== ====================================================================================================== -Type Name fastpath_tx_access fastpath_rx_access comment -======================= ===================== =================== =================== ====================================================================================================== -struct sock sk read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data -struct ipv6_pinfo* pinet6 -be16 inet_sport read_mostly __tcp_transmit_skb -be32 inet_daddr read_mostly ip_select_ident_segs -be32 inet_rcv_saddr -be16 inet_dport read_mostly __tcp_transmit_skb -u16 inet_num -be32 inet_saddr -s16 uc_ttl read_mostly __ip_queue_xmit/ip_select_ttl -u16 cmsg_flags -struct ip_options_rcu* inet_opt read_mostly __ip_queue_xmit -u16 inet_id read_mostly ip_select_ident_segs -u8 tos read_mostly ip_queue_xmit -u8 min_ttl -u8 mc_ttl -u8 pmtudisc -u8:1 recverr -u8:1 is_icsk -u8:1 freebind -u8:1 hdrincl -u8:1 mc_loop -u8:1 transparent -u8:1 mc_all -u8:1 nodefrag -u8:1 bind_address_no_port -u8:1 recverr_rfc4884 -u8:1 defer_connect read_mostly tcp_sendmsg_fastopen -u8 rcv_tos -u8 convert_csum -int uc_index -int mc_index -be32 mc_addr -struct ip_mc_socklist* mc_list -struct inet_cork_full cork read_mostly __tcp_transmit_skb -struct local_port_range -======================= ===================== =================== =================== ====================================================================================================== +======================== ===================== =================== =================== 
====================================================================================================== +Type Name fastpath_tx_access fastpath_rx_access comment +======================== ===================== =================== =================== ====================================================================================================== +struct sock sk read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data +struct ipv6_pinfo* pinet6 +struct ipv6_fl_socklist* ipv6_fl_list read_mostly tcp_v6_connect,__ip6_datagram_connect,udpv6_sendmsg,rawv6_sendmsg +be16 inet_sport read_mostly __tcp_transmit_skb +be32 inet_daddr read_mostly ip_select_ident_segs +be32 inet_rcv_saddr +be16 inet_dport read_mostly __tcp_transmit_skb +u16 inet_num +be32 inet_saddr +s16 uc_ttl read_mostly __ip_queue_xmit/ip_select_ttl +u16 cmsg_flags +struct ip_options_rcu* inet_opt read_mostly __ip_queue_xmit +u16 inet_id read_mostly ip_select_ident_segs +u8 tos read_mostly ip_queue_xmit +u8 min_ttl +u8 mc_ttl +u8 pmtudisc +u8:1 recverr +u8:1 is_icsk +u8:1 freebind +u8:1 hdrincl +u8:1 mc_loop +u8:1 transparent +u8:1 mc_all +u8:1 nodefrag +u8:1 bind_address_no_port +u8:1 recverr_rfc4884 +u8:1 defer_connect read_mostly tcp_sendmsg_fastopen +u8 rcv_tos +u8 convert_csum +int uc_index +int mc_index +be32 mc_addr +struct ip_mc_socklist* mc_list +struct inet_cork_full cork read_mostly __tcp_transmit_skb +struct local_port_range +======================== ===================== =================== =================== ====================================================================================================== diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 4ee970f3bad6..ee0154337a9c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1199,12 +1199,12 @@ static struct sock *chtls_recv_sock(struct sock *lsk, struct ipv6_pinfo *newnp = inet6_sk(newsk); struct ipv6_pinfo *np = inet6_sk(lsk); - inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + newinet->pinet6 = &newtcp6sk->inet6; + newinet->ipv6_fl_list = NULL; memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newsk->sk_v6_daddr = treq->ir_v6_rmt_addr; newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr; inet6_sk(newsk)->saddr = treq->ir_v6_loc_addr; - newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newsk->sk_bound_dev_if = treq->ir_iif; newinet->inet_opt = NULL; diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 43b7bb828738..7294e4e89b79 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -271,7 +271,6 @@ struct ipv6_pinfo { struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; - struct ipv6_fl_socklist __rcu *ipv6_fl_list; }; /* We currently use available bits from inet_sk(sk)->inet_flags, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 1086256549fa..b6ec08072533 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -214,6 +214,7 @@ struct inet_sock { struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; + struct ipv6_fl_socklist __rcu *ipv6_fl_list; #endif /* Socket demultiplex comparisons on incoming packets. 
*/ #define inet_daddr sk.__sk_common.skc_daddr diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a3ff575798dd..60d0be47a9f3 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -66,8 +66,8 @@ EXPORT_SYMBOL(ipv6_flowlabel_exclusive); fl != NULL; \ fl = rcu_dereference(fl->next)) -#define for_each_sk_fl_rcu(np, sfl) \ - for (sfl = rcu_dereference(np->ipv6_fl_list); \ +#define for_each_sk_fl_rcu(sk, sfl) \ + for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) @@ -262,12 +262,11 @@ static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; - struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { @@ -283,16 +282,16 @@ EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ipv6_fl_socklist *sfl; - if (!rcu_access_pointer(np->ipv6_fl_list)) + if (!rcu_access_pointer(inet->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); - while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { - np->ipv6_fl_list = sfl->next; + inet->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); @@ -470,16 +469,15 @@ done: static int mem_check(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); + struct ipv6_fl_socklist *sfl; int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) + for_each_sk_fl_rcu(sk, sfl) count++; rcu_read_unlock(); @@ -492,13 +490,15 @@ static int mem_check(struct sock *sk) return 0; } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) +static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { + struct inet_sock *inet = inet_sk(sk); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - rcu_assign_pointer(np->ipv6_fl_list, sfl); + sfl->next = inet->ipv6_fl_list; + rcu_assign_pointer(inet->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } @@ -520,7 +520,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; @@ -559,7 +559,7 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) } spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; + for (sflp = &inet_sk(sk)->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) @@ -579,13 +579,12 @@ found: static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { err = 
fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); @@ -614,7 +613,6 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; @@ -645,7 +643,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); @@ -682,7 +680,7 @@ recheck: fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); + fl_link(sk, sfl1, fl1); fl_free(fl); return 0; @@ -716,7 +714,7 @@ release: } } - fl_link(np, sfl1, fl); + fl_link(sk, sfl1, fl); return 0; done: fl_free(fl); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 59c4977a811a..6197dd4e6261 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1386,7 +1386,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (!newsk) return NULL; - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); @@ -1405,7 +1407,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); @@ -1453,10 +1454,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; + newinet->inet_opt = NULL; newtp = tcp_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); @@ -1469,10 +1472,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * First: no IPv4 options. 
*/ - newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 568ff8797c39..d725b2158758 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -782,9 +782,10 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { - struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; + struct inet_sock *newinet; + struct sock *newsk; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) @@ -796,7 +797,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; - inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; + newinet = inet_sk(newsk); + newinet->pinet6 = &newsctp6sk->inet6; + newinet->ipv6_fl_list = NULL; sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; @@ -805,7 +808,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; sctp_v6_copy_ip_options(sk, newsk); -- cgit v1.2.3 From 9c4609225ec1cb551006d6a03c7c4ad8cb5584c0 Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 15 Oct 2025 10:02:34 +0800 Subject: rculist: Add hlist_nulls_replace_rcu() and hlist_nulls_replace_init_rcu() Add two functions to atomically replace RCU-protected hlist_nulls entries. Keep using WRITE_ONCE() to assign values to ->next and ->pprev, as mentioned in the patch below: commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for rculist_nulls") commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Frederic Weisbecker Reviewed-by: Eric Dumazet Signed-off-by: Xuanqiang Luo Link: https://patch.msgid.link/20251015020236.431822-2-xuanqiang.luo@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/rculist_nulls.h | 59 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 89186c499dd4..c26cb83ca071 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) +/** + * hlist_nulls_pprev_rcu - returns the dereferenced pprev of @node. + * @node: element of the list. + */ +#define hlist_nulls_pprev_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)(node)->pprev)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -152,6 +159,58 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } +/** + * hlist_nulls_replace_rcu - replace an old entry by a new one + * @old: the element to be replaced + * @new: the new element to insert + * + * Description: + * Replace the old entry with the new one in a RCU-protected hlist_nulls, while + * permitting racing traversals. 
+ * + * The caller must take whatever precautions are necessary (such as holding + * appropriate locks) to avoid racing with another list-mutation primitive, such + * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same + * list. However, it is perfectly legal to run concurrently with the _rcu + * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). + */ +static inline void hlist_nulls_replace_rcu(struct hlist_nulls_node *old, + struct hlist_nulls_node *new) +{ + struct hlist_nulls_node *next = old->next; + + WRITE_ONCE(new->next, next); + WRITE_ONCE(new->pprev, old->pprev); + rcu_assign_pointer(hlist_nulls_pprev_rcu(new), new); + if (!is_a_nulls(next)) + WRITE_ONCE(next->pprev, &new->next); +} + +/** + * hlist_nulls_replace_init_rcu - replace an old entry by a new one and + * initialize the old + * @old: the element to be replaced + * @new: the new element to insert + * + * Description: + * Replace the old entry with the new one in a RCU-protected hlist_nulls, while + * permitting racing traversals, and reinitialize the old entry. + * + * Note: @old must be hashed. + * + * The caller must take whatever precautions are necessary (such as holding + * appropriate locks) to avoid racing with another list-mutation primitive, such + * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same + * list. However, it is perfectly legal to run concurrently with the _rcu + * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). + */ +static inline void hlist_nulls_replace_init_rcu(struct hlist_nulls_node *old, + struct hlist_nulls_node *new) +{ + hlist_nulls_replace_rcu(old, new); + WRITE_ONCE(old->pprev, NULL); +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. -- cgit v1.2.3 From d52bb3daad3f28403676dff31fa0577bdaf8e7c6 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 18 Oct 2025 12:55:31 +0900 Subject: firewire: core: handle device quirk of TASCAM FW-1884/FW-1804/FW-1082 TASCAM FW-1884/FW-1804/FW-1082 is slow to respond to asynchronous requests at S400, and the asynchronous transactions often result in a timeout. This is a problematic quirk. This commit adds support for the quirk: when the new quirk flag is detected, the transaction speed is limited to S200.
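For illustration, a hedged sketch of how a unit driver could honour the new quirk when issuing its own transactions; read_register_sketch() and the clamping policy are hypothetical, while fw_parent_device(), fw_run_transaction(), SCODE_200 and FW_DEVICE_QUIRK_UNSTABLE_AT_S400 are the existing firewire-core interfaces used by the diff below.

static int read_register_sketch(struct fw_unit *unit, u64 offset, __be32 *reg)
{
	struct fw_device *device = fw_parent_device(unit);
	int speed = device->max_speed;

	/* Clamp to S200 for devices carrying the new quirk flag. */
	if (READ_ONCE(device->quirks) & FW_DEVICE_QUIRK_UNSTABLE_AT_S400)
		speed = min_t(int, speed, SCODE_200);

	return fw_run_transaction(device->card, TCODE_READ_QUADLET_REQUEST,
				  device->node_id, device->generation, speed,
				  offset, reg, sizeof(*reg));
}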
Link: https://lore.kernel.org/r/20251018035532.287124-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 18 +++++++++++++++++- include/linux/firewire.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 6a5740ed4934..1674de477852 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -571,6 +571,14 @@ static const struct entry_match motu_audio_express_matches[] = { { 8, 0x17104800 }, }; +static const struct entry_match tascam_fw_series_matches[] = { + { 1, 0x0300022e }, + { 3, 0x8d000006 }, + { 4, 0xd1000001 }, + { 6, 0x1200022e }, + { 8, 0xd4000004 }, +}; + static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned int length) { static const struct { @@ -583,6 +591,11 @@ static int detect_quirks_by_root_directory(const u32 *root_directory, unsigned i .matches = motu_audio_express_matches, .match_count = ARRAY_SIZE(motu_audio_express_matches), }, + { + .quirk = FW_DEVICE_QUIRK_UNSTABLE_AT_S400, + .matches = tascam_fw_series_matches, + .match_count = ARRAY_SIZE(tascam_fw_series_matches), + }, }; int quirks = 0; int i; @@ -761,7 +774,10 @@ static int read_config_rom(struct fw_device *device, int generation) // Just prevent from torn writing/reading. WRITE_ONCE(device->quirks, quirks); - speed = device->node->max_speed; + if (unlikely(quirks & FW_DEVICE_QUIRK_UNSTABLE_AT_S400)) + speed = SCODE_200; + else + speed = device->node->max_speed; // Determine the speed of // - devices with link speed less than PHY speed, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f1d8734c0ec6..6143b7d28eac 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -179,6 +179,9 @@ enum fw_device_quirk { // MOTU Audio Express transfers acknowledge packet with 0x10 for pending state. FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2), + + // TASCAM FW-1082/FW-1804/FW-1884 often freezes when receiving S400 packets. + FW_DEVICE_QUIRK_UNSTABLE_AT_S400 = BIT(3), }; enum fw_device_state { -- cgit v1.2.3 From b57100a3d9ced8c2b78e87d313f514a3338d016e Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Tue, 14 Oct 2025 01:00:27 +0530 Subject: PM: console: Fix memory allocation error handling in pm_vt_switch_required() The pm_vt_switch_required() function fails silently when memory allocation fails, offering no indication to callers that the operation was unsuccessful. This behavior prevents drivers from handling allocation errors correctly or implementing retry mechanisms. By ensuring that failures are reported back to the caller, drivers can make informed decisions, improve robustness, and avoid unexpected behavior during critical power management operations. Change the function signature to return an integer error code and modify the implementation to return -ENOMEM when kmalloc() fails. Update both the function declaration and the inline stub in include/linux/pm.h to maintain consistency across CONFIG_VT_CONSOLE_SLEEP configurations. The function now returns: - 0 on success (including when updating existing entries) - -ENOMEM when memory allocation fails This change improves error reporting without breaking existing callers, as the current callers in drivers/video/fbdev/core/fbmem.c already ignore the return value, making this a backward-compatible improvement. 
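For illustration, a minimal sketch of the calling pattern this change enables; my_fb_probe() is hypothetical, and as noted above the current in-tree callers still ignore the return value.

static int my_fb_probe(struct device *dev)
{
	int ret;

	/* Record that this device does not need a VT switch around suspend. */
	ret = pm_vt_switch_required(dev, false);
	if (ret)	/* -ENOMEM when the tracking entry cannot be allocated */
		return ret;

	return 0;
}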
Reviewed-by: Lyude Paul Signed-off-by: Malaya Kumar Rout Reviewed-by: Dhruva Gole Reviewed-by: Lyude Paul Link: https://patch.msgid.link/20251013193028.89570-1-mrout@redhat.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 5 +++-- kernel/power/console.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..a72e42eec130 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { diff --git a/kernel/power/console.c b/kernel/power/console.c index 19c48aa5355d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); -- cgit v1.2.3 From 8b9cd112f1ac8d72244b189654e693012ea8dfe0 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Thu, 9 Oct 2025 10:31:27 +0100 Subject: soc: samsung: gs101-pmu: implement access tables for read and write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing non-existent PMU registers causes an SError, halting the system. Implement read and write access tables for the gs101-PMU to specify which registers are read- and/or writable to avoid that SError. 
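For illustration, a hedged sketch of how read/write access tables are normally attached to a regmap; the register ranges and names below are made up, and the actual driver routes its tables through struct exynos_pmu_data as shown in the diff that follows.

static const struct regmap_range example_ranges[] = {
	regmap_reg_range(0x0000, 0x00ff),	/* hypothetical accessible block */
	regmap_reg_range(0x3000, 0x30ff),
};

static const struct regmap_access_table example_access_table = {
	.yes_ranges = example_ranges,
	.n_yes_ranges = ARRAY_SIZE(example_ranges),
};

static const struct regmap_config example_pmu_regmap_config = {
	.reg_bits = 32,
	.val_bits = 32,
	.reg_stride = 4,
	.max_register = 0x30ff,
	/* Accesses outside the listed ranges are rejected by the regmap core. */
	.rd_table = &example_access_table,
	.wr_table = &example_access_table,
};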
Reviewed-by: Sam Protsenko Signed-off-by: André Draszik Link: https://patch.msgid.link/20251009-gs101-pmu-regmap-tables-v2-3-2d64f5261952@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/soc/samsung/gs101-pmu.c | 306 ++++++++++++++++++++++++- include/linux/soc/samsung/exynos-regs-pmu.h | 343 +++++++++++++++++++++++++++- 2 files changed, 640 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/samsung/gs101-pmu.c b/drivers/soc/samsung/gs101-pmu.c index ec345d09f21f..17dadc1b9c6e 100644 --- a/drivers/soc/samsung/gs101-pmu.c +++ b/drivers/soc/samsung/gs101-pmu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "exynos-pmu.h" @@ -20,9 +21,312 @@ #define TENSOR_PMUREG_WRITE 1 #define TENSOR_PMUREG_RMW 2 +static const struct regmap_range gs101_pmu_registers[] = { + regmap_reg_range(GS101_OM_STAT, GS101_SYSTEM_INFO), + regmap_reg_range(GS101_IDLE_IP(0), GS101_IDLE_IP_MASK(3)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0), + GS101_PPMPURAM_INFORM_SCL_CH(3)), + regmap_reg_range(GS101_INFORM0, GS101_SYSIP_DAT(0)), + /* skip SYSIP_DAT1 SYSIP_DAT2 */ + regmap_reg_range(GS101_SYSIP_DAT(3), GS101_PWR_HOLD_SW_TRIP), + regmap_reg_range(GS101_GSA_INFORM(0), GS101_GSA_INFORM(1)), + regmap_reg_range(GS101_INFORM4, GS101_IROM_INFORM), + regmap_reg_range(GS101_IROM_CPU_INFORM(0), GS101_IROM_CPU_INFORM(7)), + regmap_reg_range(GS101_PMU_SPARE(0), GS101_PMU_SPARE(3)), + /* skip most IROM_xxx registers */ + regmap_reg_range(GS101_DREX_CALIBRATION(0), GS101_DREX_CALIBRATION(7)), + +#define CLUSTER_CPU_RANGE(cl, cpu) \ + regmap_reg_range(GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu), \ + GS101_CLUSTER_CPU_OPTION(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_OUT(cl, cpu), \ + GS101_CLUSTER_CPU_IN(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu), \ + GS101_CLUSTER_CPU_INT_DIR(cl, cpu)) + + /* cluster 0..2 and cpu 0..4 or 0..1 */ + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1), +#undef CLUSTER_CPU_RANGE + +#define CLUSTER_NONCPU_RANGE(cl) \ + regmap_reg_range(GS101_CLUSTER_NONCPU_CONFIGURATION(cl), \ + GS101_CLUSTER_NONCPU_OPTION(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_OUT(cl), \ + GS101_CLUSTER_NONCPU_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl), \ + GS101_CLUSTER_NONCPU_INT_DIR(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl)) + + CLUSTER_NONCPU_RANGE(0), + regmap_reg_range(GS101_CLUSTER0_NONCPU_DSU_PCH, + GS101_CLUSTER0_NONCPU_DSU_PCH), + CLUSTER_NONCPU_RANGE(1), + CLUSTER_NONCPU_RANGE(2), +#undef CLUSTER_NONCPU_RANGE + +#define SUBBLK_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CONFIGURATION(blk), \ + GS101_SUBBLK_CTRL(blk)), \ + regmap_reg_range(GS101_SUBBLK_OUT(blk), GS101_SUBBLK_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_INT_IN(blk), \ + GS101_SUBBLK_INT_DIR(blk)), \ + regmap_reg_range(GS101_SUBBLK_MEMORY_OUT(blk), \ + GS101_SUBBLK_MEMORY_IN(blk)) + + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU), 
+ SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D), +#undef SUBBLK_RANGE + +#define SUBBLK_CPU_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CPU_CONFIGURATION(blk), \ + GS101_SUBBLK_CPU_OPTION(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_OUT(blk), \ + GS101_SUBBLK_CPU_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk), \ + GS101_SUBBLK_CPU_INT_DIR(blk)) + + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS), +#undef SUBBLK_CPU_RANGE + + regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CTRL), + regmap_reg_range(GS101_MIF_OUT, GS101_MIF_IN), + regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_DIR), + regmap_reg_range(GS101_TOP_CONFIGURATION, GS101_TOP_OPTION), + regmap_reg_range(GS101_TOP_OUT, GS101_TOP_IN), + regmap_reg_range(GS101_TOP_INT_IN, GS101_WAKEUP2_STAT), + regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_DIR), + regmap_reg_range(GS101_SYSTEM_CONFIGURATION, GS101_USER_DEFINED_OUT), + regmap_reg_range(GS101_SYSTEM_OUT, GS101_SYSTEM_IN), + regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_EINT_WAKEUP_MASK3), + regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_SCAN2DRAM_INT_DIR), + /* skip HCU_START */ + regmap_reg_range(GS101_CUSTOM_OUT, GS101_CUSTOM_IN), + regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_DIR), + regmap_reg_range(GS101_ACK_LAST_CPU, GS101_HCU_R(3)), + regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC), + /* skip PMU_RAM_CTRL */ + regmap_reg_range(GS101_APM_HCU_CTRL, GS101_APM_HCU_CTRL), + regmap_reg_range(GS101_APM_NMI_ENABLE, GS101_RST_STAT_PMU), + regmap_reg_range(GS101_HPM_INT_IN, GS101_BOOT_STAT), + regmap_reg_range(GS101_PMLINK_OUT, GS101_PMLINK_AOC_CTRL), + regmap_reg_range(GS101_TCXO_BUF_CTRL, GS101_ADD_CTRL), + regmap_reg_range(GS101_HCU_TIMEOUT_RESET, GS101_HCU_TIMEOUT_SCAN2DRAM), + regmap_reg_range(GS101_TIMER(0), GS101_TIMER(3)), + regmap_reg_range(GS101_PPC_MIF(0), GS101_PPC_EH), + /* PPC_OFFSET, skip PPC_CPUCL1_0 
PPC_CPUCL1_1 */ + regmap_reg_range(GS101_EXT_REGULATOR_MIF_DURATION, GS101_TCXO_DURATION), + regmap_reg_range(GS101_BURNIN_CTRL, GS101_TMU_SUB_TRIP), + regmap_reg_range(GS101_MEMORY_CEN, GS101_MEMORY_SMX_FEEDBACK), + regmap_reg_range(GS101_SLC_PCH_CHANNEL, GS101_SLC_PCH_CB), + regmap_reg_range(GS101_FORCE_NOMC, GS101_FORCE_NOMC), + regmap_reg_range(GS101_FORCE_BOOST, GS101_PMLINK_SLC_BUSY), + regmap_reg_range(GS101_BOOTSYNC_OUT, GS101_CTRL_SECJTAG_ALIVE), + regmap_reg_range(GS101_CTRL_DIV_PLL_ALV_DIVLOW, GS101_CTRL_CLKDIV__CLKRTC), + regmap_reg_range(GS101_CTRL_SOC32K, GS101_CTRL_SBU_SW_EN), + regmap_reg_range(GS101_PAD_CTRL_CLKOUT0, GS101_PAD_CTRL_WRESETO_n), + regmap_reg_range(GS101_PHY_CTRL_USB20, GS101_PHY_CTRL_UFS), +}; + +static const struct regmap_range gs101_pmu_ro_registers[] = { + regmap_reg_range(GS101_OM_STAT, GS101_VERSION), + regmap_reg_range(GS101_OTP_STATUS, GS101_OTP_STATUS), + + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0), + GS101_PPMPURAM_STATE_SLC_CH(0)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(1), + GS101_PPMPURAM_STATE_SLC_CH(1)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(2), + GS101_PPMPURAM_STATE_SLC_CH(2)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(3), + GS101_PPMPURAM_STATE_SLC_CH(3)), + +#define CLUSTER_CPU_RANGE(cl, cpu) \ + regmap_reg_range(GS101_CLUSTER_CPU_IN(cl, cpu), \ + GS101_CLUSTER_CPU_IN(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu), \ + GS101_CLUSTER_CPU_INT_IN(cl, cpu)) + + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1), +#undef CLUSTER_CPU_RANGE + +#define CLUSTER_NONCPU_RANGE(cl) \ + regmap_reg_range(GS101_CLUSTER_NONCPU_IN(cl), \ + GS101_CLUSTER_NONCPU_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl), \ + GS101_CLUSTER_NONCPU_INT_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl)) + + CLUSTER_NONCPU_RANGE(0), + CLUSTER_NONCPU_RANGE(1), + CLUSTER_NONCPU_RANGE(2), + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_EN(2), + GS101_CLUSTER_NONCPU_INT_DIR(2)), +#undef CLUSTER_NONCPU_RANGE + +#define SUBBLK_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_IN(blk), GS101_SUBBLK_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_INT_IN(blk), \ + GS101_SUBBLK_INT_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_MEMORY_IN(blk), \ + GS101_SUBBLK_MEMORY_IN(blk)) + + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP), + 
SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D), +#undef SUBBLK_RANGE + +#define SUBBLK_CPU_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CPU_IN(blk), \ + GS101_SUBBLK_CPU_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk), \ + GS101_SUBBLK_CPU_INT_IN(blk)) + + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS), +#undef SUBBLK_CPU_RANGE + + regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CONFIGURATION), + regmap_reg_range(GS101_MIF_IN, GS101_MIF_IN), + regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_IN), + regmap_reg_range(GS101_TOP_IN, GS101_TOP_IN), + regmap_reg_range(GS101_TOP_INT_IN, GS101_TOP_INT_IN), + regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_IN), + regmap_reg_range(GS101_SYSTEM_IN, GS101_SYSTEM_IN), + regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_SYSTEM_INT_IN), + regmap_reg_range(GS101_EINT_INT_IN, GS101_EINT_INT_IN), + regmap_reg_range(GS101_EINT2_INT_IN, GS101_EINT2_INT_IN), + regmap_reg_range(GS101_EINT3_INT_IN, GS101_EINT3_INT_IN), + regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_USER_DEFINED_INT_IN), + regmap_reg_range(GS101_SCAN2DRAM_INT_IN, GS101_SCAN2DRAM_INT_IN), + regmap_reg_range(GS101_CUSTOM_IN, GS101_CUSTOM_IN), + regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_IN), + regmap_reg_range(GS101_HCU_R(0), GS101_HCU_R(3)), + regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC), + regmap_reg_range(GS101_NMI_SRC_IN, GS101_NMI_SRC_IN), + regmap_reg_range(GS101_HPM_INT_IN, GS101_HPM_INT_IN), + regmap_reg_range(GS101_MEMORY_PGEN_FEEDBACK, GS101_MEMORY_PGEN_FEEDBACK), + regmap_reg_range(GS101_MEMORY_SMX_FEEDBACK, GS101_MEMORY_SMX_FEEDBACK), + regmap_reg_range(GS101_PMLINK_SLC_ACK, GS101_PMLINK_SLC_BUSY), + regmap_reg_range(GS101_BOOTSYNC_IN, GS101_BOOTSYNC_IN), + regmap_reg_range(GS101_SCAN_READY_IN, GS101_SCAN_READY_IN), + regmap_reg_range(GS101_CTRL_PLL_ALV_LOCK, GS101_CTRL_PLL_ALV_LOCK), +}; + +static const struct regmap_access_table gs101_pmu_rd_table = { + .yes_ranges = gs101_pmu_registers, + .n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers), +}; + +static const struct regmap_access_table gs101_pmu_wr_table = { + .yes_ranges = gs101_pmu_registers, + .n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers), + .no_ranges = gs101_pmu_ro_registers, + .n_no_ranges = ARRAY_SIZE(gs101_pmu_ro_registers), +}; + const struct exynos_pmu_data gs101_pmu_data = { .pmu_secure = true, .pmu_cpuhp = true, + .rd_table = &gs101_pmu_rd_table, + .wr_table = &gs101_pmu_wr_table, }; /* @@ -124,7 +428,7 @@ static bool tensor_is_atomic(unsigned int reg) return false; switch (reg) { - case GS101_SYSIP_DAT0: + case GS101_SYSIP_DAT(0): case 
GS101_SYSTEM_CONFIGURATION: return false; default: diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 71e0c09a49eb..532c6c2d1195 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -672,14 +672,341 @@ /* For Tensor GS101 */ /* PMU ALIVE */ -#define GS101_SYSIP_DAT0 (0x810) -#define GS101_CPU0_INFORM (0x860) -#define GS101_CPU_INFORM(cpu) \ - (GS101_CPU0_INFORM + (cpu*4)) -#define GS101_SYSTEM_CONFIGURATION (0x3A00) -#define GS101_EINT_WAKEUP_MASK (0x3A80) -#define GS101_PHY_CTRL_USB20 (0x3EB0) -#define GS101_PHY_CTRL_USBDP (0x3EB4) +#define GS101_OM_STAT 0x0000 +#define GS101_VERSION 0x0004 +#define GS101_PORESET_CHECK 0x0008 +#define GS101_OTP_STATUS 0x000c +#define GS101_SYSTEM_INFO 0x0010 +#define GS101_IDLE_IP(n) (0x03e0 + ((n) & 3) * 4) +#define GS101_IDLE_IP_MASK(n) (0x03f0 + ((n) & 3) * 4) +#define GS101_SLC_CH_OFFSET(ch) (0x0400 + ((ch) & 3) * 0x10) +#define GS101_DATARAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x00) +#define GS101_TAGRAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x04) +#define GS101_LRURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x08) +#define GS101_PPMPURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x0c) +#define GS101_DATARAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x40) +#define GS101_TAGRAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x44) +#define GS101_LRURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x48) +#define GS101_PPMPURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x4c) +#define GS101_INFORM0 0x0800 +#define GS101_INFORM1 0x0804 +#define GS101_INFORM2 0x0808 +#define GS101_INFORM3 0x080c +#define GS101_SYSIP_DAT(n) (0x0810 + ((n) & 3) * 4) +#define GS101_PWR_HOLD_HW_TRIP 0x0820 +#define GS101_PWR_HOLD_SW_TRIP 0x0824 +#define GS101_GSA_INFORM(n) (0x0830 + ((n) & 1) * 4) +#define GS101_INFORM4 0x0840 +#define GS101_INFORM5 0x0844 +#define GS101_INFORM6 0x0848 +#define GS101_INFORM7 0x084c +#define GS101_INFORM8 0x0850 +#define GS101_INFORM9 0x0854 +#define GS101_INFORM10 0x0858 +#define GS101_INFORM11 0x085c +#define GS101_CPU_INFORM(cpu) (0x0860 + ((cpu) & 7) * 4) +#define GS101_IROM_INFORM 0x0880 +#define GS101_IROM_CPU_INFORM(cpu) (0x0890 + ((cpu) & 7) * 4) +#define GS101_PMU_SPARE(n) (0x0900 + ((n) & 3) * 4) +#define GS101_IROM_DATA_REG(n) (0x0980 + ((n) & 3) * 4) +#define GS101_IROM_PWRMODE 0x0990 +#define GS101_DREX_CALIBRATION(n) (0x09a0 + ((n) & 7) * 4) + +#define GS101_CLUSTER0_OFFSET 0x1000 +#define GS101_CLUSTER1_OFFSET 0x1300 +#define GS101_CLUSTER2_OFFSET 0x1500 +#define GS101_CLUSTER_CPU_OFFSET(cl, cpu) ((cl) + ((cpu) * 0x80)) +#define GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x00) +#define GS101_CLUSTER_CPU_STATUS(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x04) +#define GS101_CLUSTER_CPU_STATES(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x08) +#define GS101_CLUSTER_CPU_OPTION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x0c) +#define GS101_CLUSTER_CPU_OUT(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x20) +#define GS101_CLUSTER_CPU_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x24) +#define GS101_CLUSTER_CPU_INT_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x40) +#define GS101_CLUSTER_CPU_INT_EN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x44) +#define GS101_CLUSTER_CPU_INT_TYPE(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x48) +#define GS101_CLUSTER_CPU_INT_DIR(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, 
cpu) + 0x4c) + +#define GS101_CLUSTER_NONCPU_OFFSET(cl) (0x1200 + ((cl) * 0x200)) +#define GS101_CLUSTER_NONCPU_CONFIGURATION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x00) +#define GS101_CLUSTER_NONCPU_STATUS(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x04) +#define GS101_CLUSTER_NONCPU_STATES(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x08) +#define GS101_CLUSTER_NONCPU_OPTION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x0c) +#define GS101_CLUSTER_NONCPU_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x20) +#define GS101_CLUSTER_NONCPU_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x24) +#define GS101_CLUSTER_NONCPU_INT_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x40) +#define GS101_CLUSTER_NONCPU_INT_EN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x44) +#define GS101_CLUSTER_NONCPU_INT_TYPE(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x48) +#define GS101_CLUSTER_NONCPU_INT_DIR(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x4c) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x60) +#define GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x64) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x6c) +#define GS101_CLUSTER0_NONCPU_DSU_PCH \ + (GS101_CLUSTER_NONCPU_OFFSET(0) + 0x80) + +#define GS101_SUBBBLK_OFFSET_ALIVE 0x1800 +#define GS101_SUBBBLK_OFFSET_AOC 0x1880 +#define GS101_SUBBBLK_OFFSET_APM 0x1900 +#define GS101_SUBBBLK_OFFSET_CMU 0x1980 +#define GS101_SUBBBLK_OFFSET_BUS0 0x1a00 +#define GS101_SUBBBLK_OFFSET_BUS1 0x1a80 +#define GS101_SUBBBLK_OFFSET_BUS2 0x1b00 +#define GS101_SUBBBLK_OFFSET_CORE 0x1b80 +#define GS101_SUBBBLK_OFFSET_EH 0x1c00 +#define GS101_SUBBBLK_OFFSET_CPUCL0 0x1c80 +#define GS101_SUBBBLK_OFFSET_CPUCL1 0x1d00 +#define GS101_SUBBBLK_OFFSET_CPUCL2 0x1d80 +#define GS101_SUBBBLK_OFFSET_G3D 0x1e00 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0 0x1e80 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_G3D 0x2000 +#define GS101_SUBBBLK_OFFSET_HSI0 0x2080 +#define GS101_SUBBBLK_OFFSET_HSI1 0x2100 +#define GS101_SUBBBLK_OFFSET_HSI2 0x2180 +#define GS101_SUBBBLK_OFFSET_DPU 0x2200 +#define GS101_SUBBBLK_OFFSET_DISP 0x2280 +#define GS101_SUBBBLK_OFFSET_G2D 0x2300 +#define GS101_SUBBBLK_OFFSET_MFC 0x2380 +#define GS101_SUBBBLK_OFFSET_CSIS 0x2400 +#define GS101_SUBBBLK_OFFSET_PDP 0x2480 +#define GS101_SUBBBLK_OFFSET_DNS 0x2500 +#define GS101_SUBBBLK_OFFSET_G3AA 0x2580 +#define GS101_SUBBBLK_OFFSET_IPP 0x2600 +#define GS101_SUBBBLK_OFFSET_ITP 0x2680 +#define GS101_SUBBBLK_OFFSET_MCSC 0x2700 +#define GS101_SUBBBLK_OFFSET_GDC 0x2780 +#define GS101_SUBBBLK_OFFSET_TNR 0x2800 +#define GS101_SUBBBLK_OFFSET_BO 0x2880 +#define GS101_SUBBBLK_OFFSET_TPU 0x2900 +#define GS101_SUBBBLK_OFFSET_MIF0 0x2980 +#define GS101_SUBBBLK_OFFSET_MIF1 0x2a00 +#define GS101_SUBBBLK_OFFSET_MIF2 0x2a80 +#define GS101_SUBBBLK_OFFSET_MIF3 0x2b00 +#define GS101_SUBBBLK_OFFSET_MISC 0x2b80 +#define GS101_SUBBBLK_OFFSET_PERIC0 0x2c00 +#define GS101_SUBBBLK_OFFSET_PERIC1 0x2c80 +#define GS101_SUBBBLK_OFFSET_S2D 0x2d00 +#define GS101_SUBBLK_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CTRL(blk) ((blk) + 0x10) +#define GS101_SUBBLK_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_INT_TYPE(blk) ((blk) + 0x48) +#define 
GS101_SUBBLK_INT_DIR(blk) ((blk) + 0x4c) +#define GS101_SUBBLK_MEMORY_OUT(blk) ((blk) + 0x60) +#define GS101_SUBBLK_MEMORY_IN(blk) ((blk) + 0x64) + +#define GS101_SUBBBLK_CPU_OFFSET_APM 0x3000 +#define GS101_SUBBBLK_CPU_OFFSET_DBGCORE 0x3080 +#define GS101_SUBBBLK_CPU_OFFSET_SSS 0x3100 +#define GS101_SUBBLK_CPU_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_CPU_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_CPU_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_CPU_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CPU_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_CPU_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_CPU_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_CPU_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_CPU_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_CPU_INT_DIR(blk) ((blk) + 0x4c) + +#define GS101_MIF_CONFIGURATION 0x3800 +#define GS101_MIF_STATUS 0x3804 +#define GS101_MIF_STATES 0x3808 +#define GS101_MIF_OPTION 0x380c +#define GS101_MIF_CTRL 0x3810 +#define GS101_MIF_OUT 0x3820 +#define GS101_MIF_IN 0x3824 +#define GS101_MIF_INT_IN 0x3840 +#define GS101_MIF_INT_EN 0x3844 +#define GS101_MIF_INT_TYPE 0x3848 +#define GS101_MIF_INT_DIR 0x384c +#define GS101_TOP_CONFIGURATION 0x3900 +#define GS101_TOP_STATUS 0x3904 +#define GS101_TOP_STATES 0x3908 +#define GS101_TOP_OPTION 0x390c +#define GS101_TOP_OUT 0x3920 +#define GS101_TOP_IN 0x3924 +#define GS101_TOP_INT_IN 0x3940 +#define GS101_TOP_INT_EN 0x3944 +#define GS101_TOP_INT_TYPE 0x3948 +#define GS101_TOP_INT_DIR 0x394c +#define GS101_WAKEUP_STAT 0x3950 +#define GS101_WAKEUP2_STAT 0x3954 +#define GS101_WAKEUP2_INT_IN 0x3960 +#define GS101_WAKEUP2_INT_EN 0x3964 +#define GS101_WAKEUP2_INT_TYPE 0x3968 +#define GS101_WAKEUP2_INT_DIR 0x396c +#define GS101_SYSTEM_CONFIGURATION 0x3a00 +#define GS101_SYSTEM_STATUS 0x3a04 +#define GS101_SYSTEM_STATES 0x3a08 +#define GS101_SYSTEM_OPTION 0x3a0c +#define GS101_SYSTEM_CTRL 0x3a10 +#define GS101_SPARE_CTRL 0x3a14 +#define GS101_USER_DEFINED_OUT 0x3a18 +#define GS101_SYSTEM_OUT 0x3a20 +#define GS101_SYSTEM_IN 0x3a24 +#define GS101_SYSTEM_INT_IN 0x3a40 +#define GS101_SYSTEM_INT_EN 0x3a44 +#define GS101_SYSTEM_INT_TYPE 0x3a48 +#define GS101_SYSTEM_INT_DIR 0x3a4c +#define GS101_EINT_INT_IN 0x3a50 +#define GS101_EINT_INT_EN 0x3a54 +#define GS101_EINT_INT_TYPE 0x3a58 +#define GS101_EINT_INT_DIR 0x3a5c +#define GS101_EINT2_INT_IN 0x3a60 +#define GS101_EINT2_INT_EN 0x3a64 +#define GS101_EINT2_INT_TYPE 0x3a68 +#define GS101_EINT2_INT_DIR 0x3a6c +#define GS101_EINT3_INT_IN 0x3a70 +#define GS101_EINT3_INT_EN 0x3a74 +#define GS101_EINT3_INT_TYPE 0x3a78 +#define GS101_EINT3_INT_DIR 0x3a7c +#define GS101_EINT_WAKEUP_MASK 0x3a80 +#define GS101_EINT_WAKEUP_MASK2 0x3a84 +#define GS101_EINT_WAKEUP_MASK3 0x3a88 +#define GS101_USER_DEFINED_INT_IN 0x3a90 +#define GS101_USER_DEFINED_INT_EN 0x3a94 +#define GS101_USER_DEFINED_INT_TYPE 0x3a98 +#define GS101_USER_DEFINED_INT_DIR 0x3a9c +#define GS101_SCAN2DRAM_INT_IN 0x3aa0 +#define GS101_SCAN2DRAM_INT_EN 0x3aa4 +#define GS101_SCAN2DRAM_INT_TYPE 0x3aa8 +#define GS101_SCAN2DRAM_INT_DIR 0x3aac +#define GS101_HCU_START 0x3ab0 +#define GS101_CUSTOM_OUT 0x3ac0 +#define GS101_CUSTOM_IN 0x3ac4 +#define GS101_CUSTOM_INT_IN 0x3ad0 +#define GS101_CUSTOM_INT_EN 0x3ad4 +#define GS101_CUSTOM_INT_TYPE 0x3ad8 +#define GS101_CUSTOM_INT_DIR 0x3adc +#define GS101_ACK_LAST_CPU 0x3afc +#define GS101_HCU_R(n) (0x3b00 + ((n) & 3) * 4) +#define GS101_HCU_SP 0x3b14 +#define GS101_HCU_PC 0x3b18 +#define GS101_PMU_RAM_CTRL 0x3b20 +#define GS101_APM_HCU_CTRL 0x3b24 +#define 
GS101_APM_NMI_ENABLE 0x3b30 +#define GS101_DBGCORE_NMI_ENABLE 0x3b34 +#define GS101_HCU_NMI_ENABLE 0x3b38 +#define GS101_PWR_HOLD_WDT_ENABLE 0x3b3c +#define GS101_NMI_SRC_IN 0x3b40 +#define GS101_RST_STAT 0x3b44 +#define GS101_RST_STAT_PMU 0x3b48 +#define GS101_HPM_INT_IN 0x3b60 +#define GS101_HPM_INT_EN 0x3b64 +#define GS101_HPM_INT_TYPE 0x3b68 +#define GS101_HPM_INT_DIR 0x3b6c +#define GS101_S2D_AUTH 0x3b70 +#define GS101_BOOT_STAT 0x3b74 +#define GS101_PMLINK_OUT 0x3c00 +#define GS101_PMLINK_AOC_OUT 0x3c04 +#define GS101_PMLINK_AOC_CTRL 0x3c08 +#define GS101_TCXO_BUF_CTRL 0x3c10 +#define GS101_ADD_CTRL 0x3c14 +#define GS101_HCU_TIMEOUT_RESET 0x3c20 +#define GS101_HCU_TIMEOUT_SCAN2DRAM 0x3c24 +#define GS101_TIMER(n) (0x3c80 + ((n) & 3) * 4) +#define GS101_PPC_MIF(n) (0x3c90 + ((n) & 3) * 4) +#define GS101_PPC_CORE 0x3ca0 +#define GS101_PPC_EH 0x3ca4 +#define GS101_PPC_CPUCL1_0 0x3ca8 +#define GS101_PPC_CPUCL1_1 0x3cac +#define GS101_EXT_REGULATOR_MIF_DURATION 0x3cb0 +#define GS101_EXT_REGULATOR_TOP_DURATION 0x3cb4 +#define GS101_EXT_REGULATOR_CPUCL2_DURATION 0x3cb8 +#define GS101_EXT_REGULATOR_CPUCL1_DURATION 0x3cbc +#define GS101_EXT_REGULATOR_G3D_DURATION 0x3cc0 +#define GS101_EXT_REGULATOR_TPU_DURATION 0x3cc4 +#define GS101_TCXO_DURATION 0x3cc8 +#define GS101_BURNIN_CTRL 0x3cd0 +#define GS101_JTAG_DBG_DET 0x3cd4 +#define GS101_MMC_CONWKUP_CTRL 0x3cd8 +#define GS101_USBDPPHY0_USBDP_WAKEUP 0x3cdc +#define GS101_TMU_TOP_TRIP 0x3ce0 +#define GS101_TMU_SUB_TRIP 0x3ce4 +#define GS101_MEMORY_CEN 0x3d00 +#define GS101_MEMORY_PGEN 0x3d04 +#define GS101_MEMORY_RET 0x3d08 +#define GS101_MEMORY_PGEN_FEEDBACK 0x3d0c +#define GS101_MEMORY_SMX 0x3d10 +#define GS101_MEMORY_SMX_FEEDBACK 0x3d14 +#define GS101_SLC_PCH_CHANNEL 0x3d20 +#define GS101_SLC_PCH_CB 0x3d24 +#define GS101_FORCE_NOMC 0x3d3c +#define GS101_FORCE_BOOST 0x3d4c +#define GS101_PMLINK_SLC_REQ 0x3d50 +#define GS101_PMLINK_SLC_ACK 0x3d54 +#define GS101_PMLINK_SLC_BUSY 0x3d58 +#define GS101_BOOTSYNC_OUT 0x3d80 +#define GS101_BOOTSYNC_IN 0x3d84 +#define GS101_SCAN_READY_OUT 0x3d88 +#define GS101_SCAN_READY_IN 0x3d8c +#define GS101_GSA_RESTORE 0x3d90 +#define GS101_ALIVE_OTP_LATCH 0x3d94 +#define GS101_DEBUG_OVERRIDE 0x3d98 +#define GS101_WDT_OPTION 0x3d9c +#define GS101_AOC_WDT_CFG 0x3da0 +#define GS101_CTRL_SECJTAG_ALIVE 0x3da4 +#define GS101_CTRL_DIV_PLL_ALV_DIVLOW 0x3e00 +#define GS101_CTRL_MUX_CLK_APM_REFSRC_AUTORESTORE 0x3e04 +#define GS101_CTRL_MUX_CLK_APM_REFSRC 0x3e08 +#define GS101_CTRL_MUX_CLK_APM_REF 0x3e0c +#define GS101_CTRL_MUX_PLL_ALV_DIV4 0x3e10 +#define GS101_CTRL_PLL_ALV_DIV4 0x3e14 +#define GS101_CTRL_OSCCLK_APMGSA 0x3e18 +#define GS101_CTRL_BLK_AOC_CLKS 0x3e1c +#define GS101_CTRL_PLL_ALV_LOCK 0x3e20 +#define GS101_CTRL_CLKDIV__CLKRTC 0x3e24 +#define GS101_CTRL_SOC32K 0x3e30 +#define GS101_CTRL_STM_PMU 0x3e34 +#define GS101_CTRL_PMU_DEBUG 0x3e38 +#define GS101_CTRL_DEBUG_UART 0x3e3c +#define GS101_CTRL_TCK 0x3e40 +#define GS101_CTRL_SBU_SW_EN 0x3e44 +#define GS101_PAD_CTRL_CLKOUT0 0x3e80 +#define GS101_PAD_CTRL_CLKOUT1 0x3e84 +#define GS101_PAD_CTRL_APM_24MOUT_0 0x3e88 +#define GS101_PAD_CTRL_APM_24MOUT_1 0x3e8c +#define GS101_PAD_CTRL_IO_FORCE_RETENTION 0x3e90 +#define GS101_PAD_CTRL_APACTIVE_n 0x3e94 +#define GS101_PAD_CTRL_TCXO_ON 0x3e98 +#define GS101_PAD_CTRL_PWR_HOLD 0x3e9c +#define GS101_PAD_CTRL_RESETO_n 0x3ea0 +#define GS101_PAD_CTRL_WRESETO_n 0x3ea4 +#define GS101_PHY_CTRL_USB20 0x3eb0 +#define GS101_PHY_CTRL_USBDP 0x3eb4 +#define GS101_PHY_CTRL_MIPI_DCPHY_M4M4 0x3eb8 +#define GS101_PHY_CTRL_MIPI_DCPHY_S4S4S4S4 
0x3ebc +#define GS101_PHY_CTRL_PCIE_GEN4_0 0x3ec0 +#define GS101_PHY_CTRL_PCIE_GEN4_1 0x3ec4 +#define GS101_PHY_CTRL_UFS 0x3ec8 /* PMU INTR GEN */ #define GS101_GRP1_INTR_BID_UPEND (0x0108) -- cgit v1.2.3 From edd548dc64a699d71ea4f537f815044e763d01e1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 12:13:23 -0700 Subject: firmware: qcom: tzmem: fix qcom_tzmem_policy kernel-doc Fix kernel-doc warnings by using correct kernel-doc syntax and formatting to prevent warnings: Warning: include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_STATIC' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_MULTIPLIER' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_ON_DEMAND' not described in enum 'qcom_tzmem_policy' Fixes: 84f5a7b67b61 ("firmware: qcom: add a dedicated TrustZone buffer allocator") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20251017191323.1820167-1-rdunlap@infradead.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index 48ac0e5454c7..23173e0c3ddd 100644 --- a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -17,11 +17,20 @@ struct qcom_tzmem_pool; * enum qcom_tzmem_policy - Policy for pool growth. */ enum qcom_tzmem_policy { - /**< Static pool, never grow above initial size. */ + /** + * @QCOM_TZMEM_POLICY_STATIC: Static pool, + * never grow above initial size. + */ QCOM_TZMEM_POLICY_STATIC = 1, - /**< When out of memory, add increment * current size of memory. */ + /** + * @QCOM_TZMEM_POLICY_MULTIPLIER: When out of memory, + * add increment * current size of memory. + */ QCOM_TZMEM_POLICY_MULTIPLIER, - /**< When out of memory add as much as is needed until max_size. */ + /** + * @QCOM_TZMEM_POLICY_ON_DEMAND: When out of memory + * add as much as is needed until max_size. + */ QCOM_TZMEM_POLICY_ON_DEMAND, }; -- cgit v1.2.3 From aa653654ee67f9cbbebb7d4c18f360ad4fef3180 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 11 Oct 2025 09:48:55 +0800 Subject: rhashtable: use likely for rhashtable lookup Sometimes, the result of rhashtable_lookup() is expected to be found. Therefore, we can use likely() for such cases. The following new functions are introduced, which use likely() or unlikely() during the lookup: rhashtable_lookup_likely rhltable_lookup_likely A micro-benchmark was run for these new functions: looking up an existing entry repeatedly 100000000 times, rhashtable_lookup_likely() gets a ~30% speedup.
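As an illustration only (the object type and params below are hypothetical, not part of this patch), a caller whose lookups are expected to hit on the hot path simply switches to the new helper inside its usual RCU read-side section:

struct obj {
	u32 id;
	struct rhash_head node;
};

static const struct rhashtable_params obj_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct obj, id),
	.head_offset	= offsetof(struct obj, node),
};

/* Sketch: the entry is expected to exist, so use the likely() variant. */
static bool obj_present(struct rhashtable *ht, u32 id)
{
	struct obj *o;

	rcu_read_lock();
	o = rhashtable_lookup_likely(ht, &id, obj_params);
	rcu_read_unlock();

	return o != NULL;
}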
Signed-off-by: Menglong Dong Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 70 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 05a221ce79a6..08e664b21f5a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl, local_irq_restore(flags); } -static inline struct rhash_head *__rht_ptr( - struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +enum rht_lookup_freq { + RHT_LOOKUP_NORMAL, + RHT_LOOKUP_LIKELY, +}; + +static __always_inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) { - return (struct rhash_head *) - ((unsigned long)p & ~BIT(0) ?: - (unsigned long)RHT_NULLS_MARKER(bkt)); + unsigned long p_val = (unsigned long)p & ~BIT(0); + + BUILD_BUG_ON(!__builtin_constant_p(freq)); + + if (freq == RHT_LOOKUP_LIKELY) + return (struct rhash_head *) + (likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); + else + return (struct rhash_head *) + (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* @@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr( * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ +static __always_inline struct rhash_head *__rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) +{ + return __rht_ptr(rcu_dereference_all(*bkt), bkt, freq); +} + static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_all(*bkt), bkt); + return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr( @@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr( struct bucket_table *tbl, unsigned int hash) { - return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, + RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, + RHT_LOOKUP_NORMAL); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, @@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, /* Internal function, do not use. */ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, - const struct rhashtable_params params) + const struct rhashtable_params params, + const enum rht_lookup_freq freq) { struct rhashtable_compare_arg arg = { .ht = ht, @@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhash_head *he; unsigned int hash; + BUILD_BUG_ON(!__builtin_constant_p(freq)); tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { - rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { if (params.obj_cmpfn ? 
params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(ht, key, params); + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_NORMAL); return he ? rht_obj(ht, he) : NULL; } +static __always_inline void *rhashtable_lookup_likely( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? rht_obj(ht, he) : NULL; +} + /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table @@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_NORMAL); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } +static __always_inline struct rhlist_head *rhltable_lookup_likely( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). -- cgit v1.2.3 From 84a222d1b369ba83f8947948670f775367e653f1 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 10 Oct 2025 12:46:32 +0000 Subject: firmware: exynos-acpm: add DVFS protocol Add ACPM DVFS protocol handler. It constructs DVFS messages that the APM firmware can understand. Signed-off-by: Tudor Ambarus Reviewed-by: Peter Griffin Tested-by: Peter Griffin # on gs101-oriole Link: https://patch.msgid.link/20251010-acpm-clk-v6-2-321ee8826fd4@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/firmware/samsung/Makefile | 4 +- drivers/firmware/samsung/exynos-acpm-dvfs.c | 80 ++++++++++++++++++++++ drivers/firmware/samsung/exynos-acpm-dvfs.h | 21 ++++++ drivers/firmware/samsung/exynos-acpm.c | 5 ++ .../linux/firmware/samsung/exynos-acpm-protocol.h | 10 +++ 5 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.c create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.h (limited to 'include/linux') diff --git a/drivers/firmware/samsung/Makefile b/drivers/firmware/samsung/Makefile index 7b4c9f6f34f5..80d4f89b33a9 100644 --- a/drivers/firmware/samsung/Makefile +++ b/drivers/firmware/samsung/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -acpm-protocol-objs := exynos-acpm.o exynos-acpm-pmic.o +acpm-protocol-objs := exynos-acpm.o +acpm-protocol-objs += exynos-acpm-pmic.o +acpm-protocol-objs += exynos-acpm-dvfs.o obj-$(CONFIG_EXYNOS_ACPM_PROTOCOL) += acpm-protocol.o diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.c b/drivers/firmware/samsung/exynos-acpm-dvfs.c new file mode 100644 index 000000000000..1c5b2b143bcc --- /dev/null +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020 Samsung Electronics Co., Ltd. + * Copyright 2020 Google LLC. 
+ * Copyright 2025 Linaro Ltd. + */ + +#include +#include +#include +#include +#include + +#include "exynos-acpm.h" +#include "exynos-acpm-dvfs.h" + +#define ACPM_DVFS_ID GENMASK(11, 0) +#define ACPM_DVFS_REQ_TYPE GENMASK(15, 0) + +#define ACPM_DVFS_FREQ_REQ 0 +#define ACPM_DVFS_FREQ_GET 1 + +static void acpm_dvfs_set_xfer(struct acpm_xfer *xfer, u32 *cmd, size_t cmdlen, + unsigned int acpm_chan_id, bool response) +{ + xfer->acpm_chan_id = acpm_chan_id; + xfer->txd = cmd; + xfer->txlen = cmdlen; + + if (response) { + xfer->rxd = cmd; + xfer->rxlen = cmdlen; + } +} + +static void acpm_dvfs_init_set_rate_cmd(u32 cmd[4], unsigned int clk_id, + unsigned long rate) +{ + cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id); + cmd[1] = rate / HZ_PER_KHZ; + cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_REQ); + cmd[3] = ktime_to_ms(ktime_get()); +} + +int acpm_dvfs_set_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id, + unsigned long rate) +{ + struct acpm_xfer xfer = {0}; + u32 cmd[4]; + + acpm_dvfs_init_set_rate_cmd(cmd, clk_id, rate); + acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, false); + + return acpm_do_xfer(handle, &xfer); +} + +static void acpm_dvfs_init_get_rate_cmd(u32 cmd[4], unsigned int clk_id) +{ + cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id); + cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_GET); + cmd[3] = ktime_to_ms(ktime_get()); +} + +unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id) +{ + struct acpm_xfer xfer; + unsigned int cmd[4] = {0}; + int ret; + + acpm_dvfs_init_get_rate_cmd(cmd, clk_id); + acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, true); + + ret = acpm_do_xfer(handle, &xfer); + if (ret) + return 0; + + return xfer.rxd[1] * HZ_PER_KHZ; +} diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.h b/drivers/firmware/samsung/exynos-acpm-dvfs.h new file mode 100644 index 000000000000..9f2778e649c9 --- /dev/null +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2020 Samsung Electronics Co., Ltd. + * Copyright 2020 Google LLC. + * Copyright 2025 Linaro Ltd. 
+ */ +#ifndef __EXYNOS_ACPM_DVFS_H__ +#define __EXYNOS_ACPM_DVFS_H__ + +#include + +struct acpm_handle; + +int acpm_dvfs_set_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int id, + unsigned long rate); +unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, + unsigned int clk_id); + +#endif /* __EXYNOS_ACPM_DVFS_H__ */ diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c index 3a69fe3234c7..9fa0335ccf5d 100644 --- a/drivers/firmware/samsung/exynos-acpm.c +++ b/drivers/firmware/samsung/exynos-acpm.c @@ -29,6 +29,7 @@ #include #include "exynos-acpm.h" +#include "exynos-acpm-dvfs.h" #include "exynos-acpm-pmic.h" #define ACPM_PROTOCOL_SEQNUM GENMASK(21, 16) @@ -590,8 +591,12 @@ static int acpm_channels_init(struct acpm_info *acpm) */ static void acpm_setup_ops(struct acpm_info *acpm) { + struct acpm_dvfs_ops *dvfs_ops = &acpm->handle.ops.dvfs_ops; struct acpm_pmic_ops *pmic_ops = &acpm->handle.ops.pmic_ops; + dvfs_ops->set_rate = acpm_dvfs_set_rate; + dvfs_ops->get_rate = acpm_dvfs_get_rate; + pmic_ops->read_reg = acpm_pmic_read_reg; pmic_ops->bulk_read = acpm_pmic_bulk_read; pmic_ops->write_reg = acpm_pmic_write_reg; diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index f628bf1862c2..b1e95435240f 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -13,6 +13,15 @@ struct acpm_handle; struct device_node; +struct acpm_dvfs_ops { + int (*set_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id, + unsigned long rate); + unsigned long (*get_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, + unsigned int clk_id); +}; + struct acpm_pmic_ops { int (*read_reg)(const struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, @@ -32,6 +41,7 @@ struct acpm_pmic_ops { }; struct acpm_ops { + struct acpm_dvfs_ops dvfs_ops; struct acpm_pmic_ops pmic_ops; }; -- cgit v1.2.3 From 7f3779a3ac3e474d043f0a2b77dd6e6bb020c577 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:43 +0000 Subject: mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio() Add a mempolicy parameter to filemap_alloc_folio() to enable NUMA-aware page cache allocations. This will be used by upcoming changes to support NUMA policies in guest-memfd, where guest memory needs to be allocated according to the NUMA policy specified by the VMM. All existing users pass NULL, maintaining current behavior.
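As a sketch of the new calling convention (the helper name below is made up for illustration): existing call sites keep passing NULL and behave exactly as before, while a NUMA-aware user such as the planned guest-memfd code can hand in its own policy.

/*
 * Hypothetical helper: allocate one order-0 page cache folio for @mapping,
 * applying @policy when the caller has one and falling back to the old
 * behaviour when @policy is NULL.
 */
static struct folio *alloc_cache_folio(struct address_space *mapping,
				       struct mempolicy *policy)
{
	return filemap_alloc_folio(mapping_gfp_mask(mapping), 0, policy);
}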
Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-4-shivankg@amd.com Signed-off-by: Sean Christopherson --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/verity.c | 2 +- fs/erofs/zdata.c | 2 +- fs/f2fs/compress.c | 2 +- include/linux/pagemap.h | 8 +++++--- mm/filemap.c | 14 +++++++++----- mm/readahead.c | 2 +- 7 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..d927ae32e7d0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,8 +491,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, - ~__GFP_FS), 0); + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + 0, NULL); if (!folio) break; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..d4523d5debcd 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -742,7 +742,7 @@ again: } folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), - 0); + 0, NULL); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc80cfe482f7..b7369fb4fbe9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ - newfolio = filemap_alloc_folio(gfp, 0); + newfolio = filemap_alloc_folio(gfp, 0, NULL); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..a65e8cd388bc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, return; } - cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); if (!cfolio) return; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..f1d0610210f7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return &filemap_alloc_folio(gfp, 0)->page; + return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..7b42fd6dcc9a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1002,11 +1002,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +struct 
folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { int n; struct folio *folio; + if (policy) + return folio_alloc_mpol_noprof(gfp, order, policy, + NO_INTERLEAVE_INDEX, numa_node_id()); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { @@ -2009,7 +2014,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order); + folio = filemap_alloc_folio(alloc_gfp, order, NULL); if (!folio) continue; @@ -2551,7 +2556,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) @@ -3983,8 +3988,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, - mapping_min_folio_order(mapping)); + folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); diff --git a/mm/readahead.c b/mm/readahead.c index 3a4b5d58eeb6..b415c9969176 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl, { struct folio *folio; - folio = filemap_alloc_folio(gfp_mask, order); + folio = filemap_alloc_folio(gfp_mask, order, NULL); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); -- cgit v1.2.3 From 16a542e22339cd5e73e56a956bbd335c7bd7c08c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:44 +0000 Subject: mm/filemap: Extend __filemap_get_folio() to support NUMA memory policies Extend __filemap_get_folio() to support NUMA memory policies by renaming the implementation to __filemap_get_folio_mpol() and adding a mempolicy parameter. The original function becomes a static inline wrapper that passes NULL for the mempolicy. This infrastructure will enable future support for NUMA-aware page cache allocations in guest_memfd memory backend KVM guests. 
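For illustration (hypothetical caller, not part of this patch), a guest_memfd-style user could look up or create a folio with a policy applied to any new allocation, while __filemap_get_folio() keeps passing a NULL policy for all existing callers:

static struct folio *get_backing_folio(struct address_space *mapping,
				       pgoff_t index,
				       struct mempolicy *policy)
{
	return __filemap_get_folio_mpol(mapping, index,
					FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					mapping_gfp_mask(mapping), policy);
}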
Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-5-shivankg@amd.com Signed-off-by: Sean Christopherson --- include/linux/pagemap.h | 10 ++++++++-- mm/filemap.c | 11 ++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f1d0610210f7..a17fabbc0269 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -755,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). diff --git a/mm/filemap.c b/mm/filemap.c index 7b42fd6dcc9a..91c4537283d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1928,11 +1928,12 @@ out: } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1943,8 +1944,8 @@ out: * * Return: The found folio or an ERR_PTR() otherwise. */ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -2014,7 +2015,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order, NULL); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; @@ -2061,7 +2062,7 @@ no_page: folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) -- cgit v1.2.3 From 7be20254a743be4f02414b9d56cc3fe5f84e6500 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Sep 2025 04:25:22 -0600 Subject: io_uring: unify task_work cancelation checks Rather than do per-tw checking, which needs to dip into the task_struct for checking flags, do it upfront before running task_work. This places a 'cancel' member in io_tw_token_t, which is assigned before running task_work for that given ctx. This is both more efficient in doing it upfront rather than for every task_work, and it means that io_should_terminate_tw() can be made private in io_uring.c rather than need to be called by various callbacks of task_work. 
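Sketch of the resulting callback pattern (the handler below is hypothetical): the core assigns tw.cancel once before running a batch of task_work, so individual callbacks only test the token instead of re-deriving the state from current->flags and ctx->refs.

static int io_foo_check(struct io_kiocb *req, io_tw_token_t tw)
{
	/* Cancelation was decided up front by the task_work runner. */
	if (unlikely(tw.cancel))
		return -ECANCELED;

	/* ... normal processing ... */
	return 0;
}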
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 27 ++++++++++++++++++++------- io_uring/io_uring.h | 13 ------------- io_uring/poll.c | 2 +- io_uring/timeout.c | 2 +- io_uring/uring_cmd.c | 2 +- 6 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c2ea6280901d..25ee982eb435 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -474,6 +474,7 @@ struct io_ring_ctx { * ONLY core io_uring.c should instantiate this struct. */ struct io_tw_state { + bool cancel; }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ typedef struct io_tw_state io_tw_token_t; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 820ef0527666..c397118da85e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -265,6 +265,20 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. + */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -275,8 +289,10 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) { + ts.cancel = io_should_terminate_tw(req->ctx); req->io_task_work.func(req, ts); + } io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1147,6 +1163,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, @@ -1205,11 +1222,6 @@ struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, { struct llist_node *node; - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx, true); - return NULL; - } - node = llist_del_all(&tctx->task_list); if (node) { node = llist_reverse_order(node); @@ -1399,6 +1411,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: + tw.cancel = io_should_terminate_tw(ctx); min_events -= ret; ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) @@ -1458,7 +1471,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); - if (unlikely(io_should_terminate_tw(ctx))) + if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 46d9141d772a..78777bf1ea4b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -558,19 +558,6 @@ static inline 
bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } -/* - * Terminate the request if either of these conditions are true: - * - * 1) It's being executed by the original task, but that task is marked - * with PF_EXITING as it's exiting. - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is - * our fallback task_work. - */ -static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) -{ - return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); -} - static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); diff --git a/io_uring/poll.c b/io_uring/poll.c index b9681d0f9f13..c403e751841a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw(req->ctx))) + if (unlikely(tw.cancel)) return -ECANCELED; do { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 17e3aab0af36..444142ba9d04 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw(req->ctx)) { + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d1e3ba62ee8e..1225f8124e4b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw(req->ctx)) + if (unlikely(tw.cancel)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ -- cgit v1.2.3 From c5ebcc80fcf7d2c6ed917371f024d2da5bce9128 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:07:27 -0700 Subject: iio: adc: qcom-vadc-common: fix vadc_scale_fn_type kernel-doc Fix multiple warnings in enum vadc_scale_fn_type by adding a leading '@' to the kernel-doc descriptions. Fixed 14 warnings in this one enum, such as: Warning: include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_DEFAULT' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_THERM_100K_PULLUP' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_PMIC_THERM' not described in enum 'vadc_scale_fn_type' Also prevent the warning on SCALE_HW_CALIB_INVALID by marking it "private:" so that kernel-doc notation is not needed for it. 
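For reference, a minimal made-up enum showing the kernel-doc syntax this change applies: documented values take a leading '@', and a "private:" marker hides values that need no documentation.

/**
 * enum foo_mode - example enum for the kernel-doc syntax used above.
 * @FOO_MODE_A: Documented value; the leading '@' is required.
 * @FOO_MODE_B: Another documented value.
 */
enum foo_mode {
	FOO_MODE_A,
	FOO_MODE_B,
	/* private: */
	FOO_MODE_INTERNAL,
};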
This leaves only one warning here, which I don't know the appropriate description of: qcom-vadc-common.h:125: warning: Enum value 'SCALE_HW_CALIB_PMIC_THERM_PM7' not described in enum 'vadc_scale_fn_type' Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/qcom-vadc-common.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h index aa21b032e861..3bf4c49726a7 100644 --- a/include/linux/iio/adc/qcom-vadc-common.h +++ b/include/linux/iio/adc/qcom-vadc-common.h @@ -83,27 +83,27 @@ struct vadc_linear_graph { /** * enum vadc_scale_fn_type - Scaling function to convert ADC code to * physical scaled units for the channel. - * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). - * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. + * @SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). + * @SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. * Uses a mapping table with 100K pullup. - * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. - * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. - * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp - * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to + * @SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. + * @SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp + * @SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to * voltage (uV) with hardware applied offset/slope values to adc code. - * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using * lookup table. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using + * @SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using * 100k pullup. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using * lookup table for PMIC7. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. This is for PMIC7. - * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 * charger temperature. - * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 * SMB1390 temperature. 
*/ enum vadc_scale_fn_type { @@ -120,6 +120,7 @@ enum vadc_scale_fn_type { SCALE_HW_CALIB_PMIC_THERM_PM7, SCALE_HW_CALIB_PM5_CHG_TEMP, SCALE_HW_CALIB_PM5_SMB_TEMP, + /* private: */ SCALE_HW_CALIB_INVALID, }; -- cgit v1.2.3 From ca82a7ea2299b4586af1f77daee66ee781202320 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 19 Sep 2025 14:42:50 -0700 Subject: iomap: simplify iomap_iter_advance() Most callers of iomap_iter_advance() do not need the remaining length returned. Get rid of the extra iomap_length() call that iomap_iter_advance() does. Signed-off-by: Joanne Koong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/dax.c | 30 ++++++++++++------------------ fs/iomap/buffered-io.c | 18 +++++++++--------- fs/iomap/direct-io.c | 6 +++--- fs/iomap/iter.c | 14 +++++--------- fs/iomap/seek.c | 8 ++++---- include/linux/iomap.h | 6 ++---- 6 files changed, 35 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 89f071ba7b10..c299fcb5618d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); /* * invalidate the pages whose sharing state is to be changed @@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (ret < 0) return ret; - ret = iomap_iter_advance(iter, &length); + ret = iomap_iter_advance(iter, length); if (ret) return ret; - } while (length > 0); + } while ((length = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; @@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); - return iomap_iter_advance(iomi, &done); + return iomap_iter_advance(iomi, done); } } @@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - length = xfer; - ret = iomap_iter_advance(iomi, &length); + ret = iomap_iter_advance(iomi, xfer); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; + length = iomap_length(iomi); } dax_read_unlock(id); @@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) { - u64 length = PAGE_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (!(ret & VM_FAULT_ERROR)) + iter.status = iomap_iter_advance(&iter, PAGE_SIZE); } if (iomap_errp) @@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) { - u64 length = PMD_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (ret != VM_FAULT_FALLBACK) + iter.status = iomap_iter_advance(&iter, PMD_SIZE); } unlock_entry: @@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; - u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, 
dax_read_unlock(id); advance: - dest_len = len; - ret = iomap_iter_advance(it_src, &len); + ret = iomap_iter_advance(it_src, len); if (!ret) - ret = iomap_iter_advance(it_dest, &dest_len); + ret = iomap_iter_advance(it_dest, len); return ret; out_unlock: diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8b847a1e27f1..6cc2ee44bbca 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -376,7 +376,7 @@ static int iomap_readpage_iter(struct iomap_iter *iter, ret = iomap_read_inline_data(iter, folio); if (ret) return ret; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } /* zero post-eof blocks as the page may be mapped */ @@ -437,7 +437,7 @@ done: * iteration. */ length = pos - iter->pos + plen; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } static int iomap_read_folio_iter(struct iomap_iter *iter, @@ -1041,7 +1041,7 @@ retry: } } else { total_written += written; - iomap_iter_advance(iter, &written); + iomap_iter_advance(iter, written); } } while (iov_iter_count(i) && iomap_length(iter)); @@ -1310,7 +1310,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter, int status; if (!iomap_want_unshare_iter(iter)) - return iomap_iter_advance(iter, &bytes); + return iomap_iter_advance(iter, bytes); do { struct folio *folio; @@ -1334,10 +1334,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter, balance_dirty_pages_ratelimited(iter->inode->i_mapping); - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); return status; } @@ -1412,10 +1412,10 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (WARN_ON_ONCE(!ret)) return -EIO; - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; @@ -1526,7 +1526,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 5d5d63efbd57..e9e5f0703160 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -496,7 +496,7 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return iomap_iter_advance(iter, &copied); + return iomap_iter_advance(iter, copied); return ret; } @@ -507,7 +507,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) dio->size += length; if (!length) return -EFAULT; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) @@ -542,7 +542,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) dio->size += copied; if (!copied) return -EFAULT; - return iomap_iter_advance(iomi, &copied); + return iomap_iter_advance(iomi, copied); } static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index cef77ca0c20b..91d2024e00da 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -13,17 +13,13 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) 
memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } -/* - * Advance the current iterator position and output the length remaining for the - * current mapping. - */ -int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +/* Advance the current iterator position and decrement the remaining length */ +int iomap_iter_advance(struct iomap_iter *iter, u64 count) { - if (WARN_ON_ONCE(*count > iomap_length(iter))) + if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; - iter->pos += *count; - iter->len -= *count; - *count = iomap_length(iter); + iter->pos += count; + iter->len -= count; return 0; } diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 56db2dd4b10d..6cbc587c93da 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } } @@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter, switch (iter->iomap.type) { case IOMAP_HOLE: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; default: *hole_pos = iter->pos; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 73dceabc21c8..4469b2318b08 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -245,7 +245,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); -int iomap_iter_advance(struct iomap_iter *iter, u64 *count); +int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration @@ -282,9 +282,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { - u64 length = iomap_length(iter); - - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, iomap_length(iter)); } /** -- cgit v1.2.3 From dc816f8d925cac34922ea73abd94ae23a96cacac Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 1 Oct 2025 01:53:14 +0200 Subject: fs: assert ->i_lock held in __iget() Also remove the now redundant comment. Signed-off-by: Mateusz Guzik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ac62b9d10b00 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3378,11 +3378,9 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -/* - * inode->i_lock must be held - */ static inline void __iget(struct inode *inode) { + lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } -- cgit v1.2.3 From 31e332b911fca54df467d264d7e2a2ef9317f3ca Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 6 Oct 2025 01:15:26 +0200 Subject: fs: add missing fences to I_NEW handling Suppose there are 2 CPUs racing inode hash lookup func (say ilookup5()) and unlock_new_inode(). 
In principle the latter can clear the I_NEW flag before prior stores into the inode were made visible. The former can in turn observe I_NEW is cleared and proceed to use the inode, while possibly reading from not-yet-published areas. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/dcache.c | 4 ++++ fs/inode.c | 8 ++++++++ include/linux/writeback.h | 4 ++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index a067fa0a965a..806d6a665124 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1981,6 +1981,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); + /* + * Pairs with smp_rmb in wait_on_inode(). + */ + smp_wmb(); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure diff --git a/fs/inode.c b/fs/inode.c index fa82cb810af4..37fc7a72aba5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1181,6 +1181,10 @@ void unlock_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); + /* + * Pairs with smp_rmb in wait_on_inode(). + */ + smp_wmb(); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure @@ -1198,6 +1202,10 @@ void discard_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); + /* + * Pairs with smp_rmb in wait_on_inode(). + */ + smp_wmb(); inode->i_state &= ~I_NEW; /* * Pairs with the barrier in prepare_to_wait_event() to make sure diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..e1e1231a6830 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -194,6 +194,10 @@ static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. + */ + smp_rmb(); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From af6023e2ce0a3d4d948885d464b0ddca4b8b1fdf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:15 +0200 Subject: fs: move wait_on_inode() from writeback.h to fs.h The only consumer outside of fs/inode.c is gfs2 and it already includes fs.h in the relevant file. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++++ include/linux/writeback.h | 11 ----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ac62b9d10b00..b35014ba681b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -949,6 +949,16 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } +static inline void wait_on_inode(struct inode *inode) +{ + wait_var_event(inode_state_wait_address(inode, __I_NEW), + !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. 
+ */ + smp_rmb(); +} + /* * inode->i_rwsem nesting subclasses for the lock validator: * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e1e1231a6830..06195c2a535b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,17 +189,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); -/* writeback.h requires fs.h; it, too, is not included from here. */ -static inline void wait_on_inode(struct inode *inode) -{ - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); - /* - * Pairs with routines clearing I_NEW. - */ - smp_rmb(); -} - #ifdef CONFIG_CGROUP_WRITEBACK #include -- cgit v1.2.3 From cb5db358ab5769cbd3e8e864f14af321126cccdb Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:16 +0200 Subject: fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb The incomming helpers don't ship with _release/_acquire variants, for the time being anyway. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 5 +++-- include/linux/backing-dev.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..9cda19a40ca2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -476,10 +476,11 @@ static bool inode_do_switch_wbs(struct inode *inode, switched = true; skip_switch: /* - * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * Paired with an acquire fence in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + smp_wmb(); + inode->i_state &= ~I_WB_SWITCH; xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e64f14739dd..065cba5dc111 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wbs_work_fn() and + * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = inode->i_state & I_WB_SWITCH; + smp_rmb(); if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); -- cgit v1.2.3 From d8753f788ab4916341d9fab81795be9f2f49c264 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:17 +0200 Subject: fs: provide accessors for ->i_state Open-coded accesses prevent asserting they are done correctly. One obvious aspect is locking, but significantly more can checked. For example it can be detected when the code is clearing flags which are already missing, or is setting flags when it is illegal (e.g., I_FREEING when ->i_count > 0). In order to keep things manageable this patchset merely gets the thing off the ground with only lockdep checks baked in. Current consumers can be trivially converted. Suppose flags I_A and I_B are to be handled. 
If ->i_lock is held, then: state = inode->i_state => state = inode_state_read(inode) inode->i_state |= (I_A | I_B) => inode_state_set(inode, I_A | I_B) inode->i_state &= ~(I_A | I_B) => inode_state_clear(inode, I_A | I_B) inode->i_state = I_A | I_B => inode_state_assign(inode, I_A | I_B) If ->i_lock is not held or only held conditionally: state = inode->i_state => state = inode_state_read_once(inode) inode->i_state |= (I_A | I_B) => inode_state_set_raw(inode, I_A | I_B) inode->i_state &= ~(I_A | I_B) => inode_state_clear_raw(inode, I_A | I_B) inode->i_state = I_A | I_B => inode_state_assign_raw(inode, I_A | I_B) The "_once" vs "_raw" discrepancy stems from the read variant differing by READ_ONCE as opposed to just lockdep checks. Finally, if you want to atomically clear flags and set new ones, the following: state = inode->i_state; state &= ~I_A; state |= I_B; inode->i_state = state; turns into: inode_state_replace(inode, I_A, I_B); Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b35014ba681b..909eb1e68637 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -759,7 +759,7 @@ enum inode_state_bits { /* reserved wait address bit 3 */ }; -enum inode_state_flags_t { +enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), @@ -843,7 +843,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_t i_state; + enum inode_state_flags_enum i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -902,6 +902,80 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * i_state handling + * + * We hide all of it behind helpers so that we can validate consumers. 
+ */ +static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) +{ + return READ_ONCE(inode->i_state); +} + +static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + return inode->i_state; +} + +static inline void inode_state_set_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, inode->i_state | flags); +} + +static inline void inode_state_set(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_set_raw(inode, flags); +} + +static inline void inode_state_clear_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, inode->i_state & ~flags); +} + +static inline void inode_state_clear(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_clear_raw(inode, flags); +} + +static inline void inode_state_assign_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, flags); +} + +static inline void inode_state_assign(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace_raw(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + enum inode_state_flags_enum flags; + flags = inode->i_state; + flags &= ~clearflags; + flags |= setflags; + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_replace_raw(inode, clearflags, setflags); +} + static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); -- cgit v1.2.3 From f5aa78e2be066f3801785094f1b55a3114fe461a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:19 +0200 Subject: Manual conversion to use ->i_state accessors of all places not covered by coccinelle Nothing to look at apart from iput_final(). Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 2 +- fs/afs/inode.c | 2 +- fs/ext4/inode.c | 10 +++++----- fs/ext4/orphan.c | 4 ++-- fs/inode.c | 18 ++++++++---------- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 6 +++--- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 8 ++++---- 9 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..35f027981b21 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -211,7 +211,7 @@ test and set for you. 
e.g.:: inode = iget_locked(sb, ino); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { err = read_inode_from_disk(inode); if (err < 0) { iget_failed(inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe2ccf59c7a..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); afs_op_set_error(op, ret); if (ret == 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..b864e9645f85 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -425,7 +425,7 @@ void ext4_check_map_extents_env(struct inode *inode) if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; @@ -3473,7 +3473,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; - return inode->i_state & I_DIRTY_DATASYNC; + return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, @@ -4552,7 +4552,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -5210,7 +5210,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); @@ -5541,7 +5541,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 33c3a89396b1..c4903d98ff81 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; @@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); diff --git a/fs/inode.c b/fs/inode.c index f094ed3e6f30..3153d725859c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -829,7 +829,7 @@ static void evict(struct inode 
*inode) * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } @@ -1883,7 +1883,6 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; - unsigned long state; int drop; WARN_ON(inode_state_read(inode) & I_NEW); @@ -1908,20 +1907,19 @@ static void iput_final(struct inode *inode) */ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); - state = inode_state_read(inode); - if (!drop) { - WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + if (drop) { + inode_state_set(inode, I_FREEING); + } else { + inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); - state = inode_state_read(inode); - WARN_ON(state & I_NEW); - state &= ~I_WILL_FREE; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_replace(inode, I_WILL_FREE, I_FREEING); } - WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); @@ -2985,7 +2983,7 @@ void dump_inode(struct inode *inode, const char *reason) pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); + inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 065cba5dc111..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -280,7 +280,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = inode->i_state & I_WB_SWITCH; + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; smp_rmb(); if (unlikely(cookie->locked)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 909eb1e68637..77b6486dcae7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1026,7 +1026,7 @@ static inline void inode_fake_hash(struct inode *inode) static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); + !(inode_state_read_once(inode) & I_NEW)); /* * Pairs with routines clearing I_NEW. 
*/ @@ -2719,8 +2719,8 @@ static inline int icount_read(const struct inode *inode) */ static inline bool inode_is_dirtytime_only(struct inode *inode) { - return (inode->i_state & (I_DIRTY_TIME | I_NEW | - I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; + return (inode_state_read_once(inode) & + (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06195c2a535b..102071ffedcb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -227,7 +227,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { - WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); + WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c08aff044e80..311a341e6fe4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), @@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), @@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; @@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), -- cgit v1.2.3 From 2ed81b4bef9b74ae0f095ad4667dbe2ae0b86a91 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:28 +0200 Subject: fs: make plain ->i_state access fail to compile ... to make sure all accesses are properly validated. Merely renaming the var to __i_state still lets the compiler make the following suggestion: error: 'struct inode' has no member named 'i_state'; did you mean '__i_state'? Unfortunately some people will add the __'s and call it a day. In order to make it harder to mess up in this way, hide it behind a struct. The resulting error message should be convincing in terms of checking what to do: error: invalid operands to binary & (have 'struct inode_state_flags' and 'int') Of course people determined to do a plain access can still do it, but nothing can be done for that case. 
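For illustration, here is a minimal sketch of a converted consumer under this scheme (a hypothetical helper, not part of this patch; it only exercises the accessors introduced earlier in the series):

static bool example_inode_is_new(struct inode *inode)
{
	bool ret;

	/* Unlocked peek: READ_ONCE() under the hood, no lockdep assertion. */
	if (!(inode_state_read_once(inode) & I_NEW))
		return false;

	/* Locked re-check: inode_state_read() asserts ->i_lock is held. */
	spin_lock(&inode->i_lock);
	ret = inode_state_read(inode) & I_NEW;
	spin_unlock(&inode->i_lock);

	return ret;
}

A plain "inode->i_state & I_NEW" in the same spot now fails with the "invalid operands to binary &" error quoted above.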
Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 77b6486dcae7..21c73df3ce75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -785,6 +785,13 @@ enum inode_state_flags_enum { #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) +/* + * Use inode_state_read() & friends to access. + */ +struct inode_state_flags { + enum inode_state_flags_enum __state; +}; + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -843,7 +850,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_enum i_state; + struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -909,19 +916,19 @@ struct inode { */ static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) { - return READ_ONCE(inode->i_state); + return READ_ONCE(inode->i_state.__state); } static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - return inode->i_state; + return inode->i_state.__state; } static inline void inode_state_set_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, inode->i_state | flags); + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); } static inline void inode_state_set(struct inode *inode, @@ -934,7 +941,7 @@ static inline void inode_state_set(struct inode *inode, static inline void inode_state_clear_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, inode->i_state & ~flags); + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); } static inline void inode_state_clear(struct inode *inode, @@ -947,7 +954,7 @@ static inline void inode_state_clear(struct inode *inode, static inline void inode_state_assign_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, flags); + WRITE_ONCE(inode->i_state.__state, flags); } static inline void inode_state_assign(struct inode *inode, @@ -962,7 +969,7 @@ static inline void inode_state_replace_raw(struct inode *inode, enum inode_state_flags_enum setflags) { enum inode_state_flags_enum flags; - flags = inode->i_state; + flags = inode->i_state.__state; flags &= ~clearflags; flags |= setflags; inode_state_assign_raw(inode, flags); -- cgit v1.2.3 From 1888635532fbbd6be4a4368621085c3a197279f8 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 16:53:15 +0800 Subject: writeback: Wake up waiting tasks when finishing the writeback of a chunk. Writing back a large number of pages can take a lots of time. This issue is exacerbated when the underlying device is slow or subject to block layer rate limiting, which in turn triggers unexpected hung task warnings. We can trigger a wake-up once a chunk has been written back and the waiting time for writeback exceeds half of sysctl_hung_task_timeout_secs. This action allows the hung task detector to be aware of the writeback progress, thereby eliminating these unexpected hung task warnings. This patch has passed the xfstests 'check -g quick' test based on ext4, with no additional failures introduced. 
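To make the threshold concrete: assuming the default sysctl_hung_task_timeout_secs of 120 (the value is tunable), a task blocked in wb_wait_for_completion() is woken after each written-back chunk once more than 60 seconds have passed without recorded progress, so the hung task detector observes forward progress well before its 120-second warning would trigger.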
Signed-off-by: Julian Sun Reviewed-by: Jan Kara Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 10 +++++++++- include/linux/backing-dev-defs.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..61a980a06cee 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include #include #include #include @@ -213,7 +214,8 @@ static void wb_queue_work(struct bdi_writeback *wb, void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, + ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -2014,6 +2016,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. */ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..c8aa749790b1 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,7 @@ enum wb_reason { struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; + unsigned long progress_stamp; /* The jiffies when slow progress is detected */ }; #define __WB_COMPLETION_INIT(_waitq) \ -- cgit v1.2.3 From d6e6215907640801b1f407dc9e871b19ca5a3805 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 15:18:29 +0800 Subject: writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs) When a writeback work lasts for sysctl_hung_task_timeout_secs, we want to identify that there are tasks waiting for a long time-this helps us pinpoint potential issues. Additionally, recording the starting jiffies is useful when debugging a crashed vmcore. 
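With the format string introduced below, such a stall would show up in the kernel log roughly as follows (task name, PID and duration are purely illustrative): INFO: The task sync:1234 has been waiting for writeback completion for more than 130 seconds.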
Signed-off-by: Julian Sun Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 17 +++++++++++++++-- include/linux/backing-dev-defs.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 61a980a06cee..e76192d140e3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -201,6 +201,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -213,9 +226,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, - ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c8aa749790b1..610ef62b6a32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -64,6 +64,7 @@ struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ -- cgit v1.2.3 From a00f3dea0352a5fb0b67b84c72daeb6563f8e67f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 9 Oct 2025 21:25:56 +0200 Subject: ACPI: PM: s2idle: Drop acpi_get_lps0_constraint() Drop unused function acpi_get_lps0_constraint(). No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/5032801.GXAFRqVoOG@rafael.j.wysocki --- drivers/acpi/x86/s2idle.c | 24 ------------------------ include/linux/acpi.h | 5 ----- 2 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index dd0b40b9bbe8..ea645853fc20 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -299,30 +299,6 @@ free_acpi_buffer: ACPI_FREE(out_obj); } -/** - * acpi_get_lps0_constraint - Get the LPS0 constraint for a device. - * @adev: Device to get the constraint for. - * - * The LPS0 constraint is the shallowest (minimum) power state in which the - * device can be so as to allow the platform as a whole to achieve additional - * energy conservation by utilizing a system-wide low-power state. - * - * Returns: - * - ACPI power state value of the constraint for @adev on success. - * - Otherwise, ACPI_STATE_UNKNOWN. 
- */ -int acpi_get_lps0_constraint(struct acpi_device *adev) -{ - struct lpi_constraints *entry; - - for_each_lpi_constraint(entry) { - if (adev->handle == entry->handle) - return entry->min_dstate; - } - - return ACPI_STATE_UNKNOWN; -} - static void lpi_check_constraints(void) { struct lpi_constraints *entry; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..252768d007c7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1146,12 +1146,7 @@ struct acpi_s2idle_dev_ops { #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); -int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ -static inline int acpi_get_lps0_constraint(struct device *dev) -{ - return ACPI_STATE_UNKNOWN; -} static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; -- cgit v1.2.3 From 813882ae22756bcf9645d405e045c60e5aab0a93 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 16 Oct 2025 15:36:46 +0100 Subject: net: stmmac: remove broken PCS code Changing the netif_carrier_*() state behind phylink's back has always been prohibited because it messes up with phylinks state tracking, and means that phylink no longer guarantees to call the mac_link_down() and mac_link_up() methods at the appropriate times. This was later documented in the sfp-phylink network driver conversion guide. stmmac was converted to phylink in 2019, but nothing was done with the "PCS" code. Since then, apart from the updates as part of phylink development, nothing has happened with stmmac to improve its use of phylink, or even to address this point. A couple of years ago, a has_integrated_pcs boolean was added by Bart, which later became the STMMAC_FLAG_HAS_INTEGRATED_PCS flag, to avoid manipulating the netif_carrier_*() state. This flag is mis-named, because whenever the stmmac is synthesized for its native SGMII, TBI or RTBI interfaces, it has an "integrated PCS". This boolean/flag actually means "ignore the status from the integrated PCS". Discussing with Bart, the reasons for this are lost to the winds of time (which is why we should always document the reasons in the commit message.) RGMII also has in-band status, and the dwmac cores and stmmac code supports this but with one bug that saves the day. When dwmac cores are synthesised for RGMII only, they do not contain an integrated PCS, and so priv->dma_cap.pcs is clear, which prevents (incorrectly) the "RGMII PCS" being used, meaning we don't read the in-band status. However, a core synthesised for RGMII and also SGMII, TBI or RTBI will have this capability bit set, thus making these code paths reachable. 
The Jetson Xavier NX uses RGMII mode to talk to its PHY, and removing the incorrect check for priv->dma_cap.pcs reveals the theortical issue with netif_carrier_*() manipulation is real: dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141) dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode 8021q: adding VLAN 0 to HW filter on device eth0 dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported Link is Up - 1000/Full Link is Down Link is Up - 1000/Full This looks good until one realises that the phylink "Link" status messages are missing, even when the RJ45 cable is reconnected. Nothing one can do results in the interface working. The interrupt handler (which prints those "Link is" messages) always wins over phylink's resolve worker, meaning phylink never calls the mac_link_up() nor mac_link_down() methods. eth0 also sees no traffic received, and is unable to obtain a DHCP address: 3: eth0: mtu 1500 qdisc mq state UP group defa ult qlen 1000 link/ether e6:d3:6a:e6:92:de brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped overrun mcast 0 0 0 0 0 0 TX: bytes packets errors dropped carrier collsns 27686 149 0 0 0 0 With the STMMAC_FLAG_HAS_INTEGRATED_PCS flag set, which disables the netif_carrier_*() manipulation then stmmac works normally: dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141) dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode 8021q: adding VLAN 0 to HW filter on device eth0 dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported Link is Up - 1000/Full dwc-eth-dwmac 2490000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx and packets can be transferred. This clearly shows that when priv->hw->pcs is set, but STMMAC_FLAG_HAS_INTEGRATED_PCS is clear, the driver reliably fails. Discovering whether a platform falls into this is impossible as parsing all the dtsi and dts files to find out which use the stmmac driver, whether any of them use RGMII or SGMII and also depends whether an external interface is being used. The kernel likely doesn't contain all dts files either. The only driver that sets this flag uses the qcom,sa8775p-ethqos compatible, and uses SGMII or 2500BASE-X. but these are saved from this problem by the incorrect check for priv->dma_cap.pcs. So, we have to assume that for every other platform that uses SGMII with stmmac is using an external PCS. Moreover, ethtool output can be incorrect. With the full-duplex link negotiated, ethtool reports: Speed: 1000Mb/s Duplex: Half because with dwmac4, the full-duplex bit is in bit 16 of the status, priv->xstats.pcs_duplex becomes BIT(16) for full duplex, but the ethtool ksettings duplex member is u8 - so becomes zero. Moreover, the supported, advertised and link partner modes are all "not reported". 
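To spell out that truncation (illustrative arithmetic only, not new driver code):

	priv->xstats.pcs_duplex = BIT(16);		/* raw full-duplex status bit: 0x10000 */
	cmd->base.duplex = priv->xstats.pcs_duplex;	/* .duplex is u8, so this truncates to 0 */
							/* 0 == DUPLEX_HALF, hence the "Duplex: Half" report */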
Finally, ksettings_set() won't be able to set the advertisement on a PHY if this PCS code is activated, which is incorrect when SGMII is used with a PHY. Thus, remove: 1. the incorrect netif_carrier_*() manipulation. 2. the broken ethtool ksettings code. Given that all uses of STMMAC_FLAG_HAS_INTEGRATED_PCS are now gone, remove the flag from stmmac.h and dwmac-qcom-ethqos.c. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Tested-by: Lad Prabhakar Link: https://patch.msgid.link/E1v9P5y-0000000AolC-1QWH@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 4 -- .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 55 ---------------------- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 9 ---- include/linux/stmmac.h | 1 - 4 files changed, 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index d8fd4d8f6ced..f62825220cf7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -96,7 +96,6 @@ struct ethqos_emac_driver_data { bool rgmii_config_loopback_en; bool has_emac_ge_3; const char *link_clk_name; - bool has_integrated_pcs; u32 dma_addr_width; struct dwmac4_addrs dwmac4_addrs; bool needs_sgmii_loopback; @@ -282,7 +281,6 @@ static const struct ethqos_emac_driver_data emac_v4_0_0_data = { .rgmii_config_loopback_en = false, .has_emac_ge_3 = true, .link_clk_name = "phyaux", - .has_integrated_pcs = true, .needs_sgmii_loopback = true, .dma_addr_width = 36, .dwmac4_addrs = { @@ -856,8 +854,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->flags |= STMMAC_FLAG_TSO_EN; if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) plat_dat->flags |= STMMAC_FLAG_RX_CLK_RUNS_IN_LPI; - if (data->has_integrated_pcs) - plat_dat->flags |= STMMAC_FLAG_HAS_INTEGRATED_PCS; if (data->dma_addr_width) plat_dat->host_dma_width = data->dma_addr_width; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 39fa1ec92f82..d89662b48087 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -322,47 +322,6 @@ static int stmmac_ethtool_get_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && - (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII)) { - u32 supported, advertising, lp_advertising; - - if (!priv->xstats.pcs_link) { - cmd->base.speed = SPEED_UNKNOWN; - cmd->base.duplex = DUPLEX_UNKNOWN; - return 0; - } - cmd->base.duplex = priv->xstats.pcs_duplex; - - cmd->base.speed = priv->xstats.pcs_speed; - - /* Encoding of PSE bits is defined in 802.3z, 37.2.1.4 */ - - ethtool_convert_link_mode_to_legacy_u32( - &supported, cmd->link_modes.supported); - ethtool_convert_link_mode_to_legacy_u32( - &advertising, cmd->link_modes.advertising); - ethtool_convert_link_mode_to_legacy_u32( - &lp_advertising, cmd->link_modes.lp_advertising); - - /* Reg49[3] always set because ANE is always supported */ - cmd->base.autoneg = ADVERTISED_Autoneg; - supported |= SUPPORTED_Autoneg; - advertising |= ADVERTISED_Autoneg; - lp_advertising |= ADVERTISED_Autoneg; - - cmd->base.port = PORT_OTHER; - - ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.supported, supported); 
- ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.advertising, advertising); - ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.lp_advertising, lp_advertising); - - return 0; - } - return phylink_ethtool_ksettings_get(priv->phylink, cmd); } @@ -372,20 +331,6 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && - (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII)) { - /* Only support ANE */ - if (cmd->base.autoneg != AUTONEG_ENABLE) - return -EINVAL; - - mutex_lock(&priv->lock); - stmmac_pcs_ctrl_ane(priv, 1, priv->hw->ps, 0); - mutex_unlock(&priv->lock); - - return 0; - } - return phylink_ethtool_ksettings_set(priv->phylink, cmd); } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c9fa965c8566..867d0ca3b45e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6001,15 +6001,6 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) for (queue = 0; queue < queues_count; queue++) stmmac_host_mtl_irq_status(priv, priv->hw, queue); - /* PCS link status */ - if (priv->hw->pcs && - !(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS)) { - if (priv->xstats.pcs_link) - netif_carrier_on(priv->dev); - else - netif_carrier_off(priv->dev); - } - stmmac_timestamp_interrupt(priv, priv); } } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fa1318bac06c..99022620457a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -171,7 +171,6 @@ struct dwmac4_addrs { u32 mtl_low_cred_offset; }; -#define STMMAC_FLAG_HAS_INTEGRATED_PCS BIT(0) #define STMMAC_FLAG_SPH_DISABLE BIT(1) #define STMMAC_FLAG_USE_PHY_WOL BIT(2) #define STMMAC_FLAG_HAS_SUN8I BIT(3) -- cgit v1.2.3 From d19f6451c6feefd6537b97efa5f3859681f243cb Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 16 Oct 2025 11:09:26 +0200 Subject: gpio: export gpiod_hwgpio() Reading the GPIO hardware number from a descriptor is a valid use-case outside of the GPIO core. Export the symbol to consumers of GPIO descriptors. Reviewed-by: Linus Walleij Reviewed-by: Andrew Jeffery Link: https://lore.kernel.org/r/20251016-aspeed-gpiolib-include-v1-2-31201c06d124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 13 +++++++++++++ drivers/gpio/gpiolib.h | 8 -------- include/linux/gpio/consumer.h | 2 ++ 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 5a450dac8f3a..a81981336b36 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -235,6 +235,19 @@ int desc_to_gpio(const struct gpio_desc *desc) } EXPORT_SYMBOL_GPL(desc_to_gpio); +/** + * gpiod_hwgpio - Return the GPIO number of the passed descriptor relative to + * its chip. + * @desc: GPIO descriptor + * + * Returns: + * Hardware offset of the GPIO represented by the descriptor. 
+ */ +int gpiod_hwgpio(const struct gpio_desc *desc) +{ + return desc - &desc->gdev->descs[0]; +} +EXPORT_SYMBOL_GPL(gpiod_hwgpio); /** * gpiod_to_chip - Return the GPIO chip to which a GPIO descriptor belongs diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 62d4c15b74f5..14e6a9807a89 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -273,14 +273,6 @@ int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev); struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum); const char *gpiod_get_label(struct gpio_desc *desc); -/* - * Return the GPIO number of the passed descriptor relative to its chip - */ -static inline int gpiod_hwgpio(const struct gpio_desc *desc) -{ - return desc - &desc->gdev->descs[0]; -} - /* With descriptor prefix */ #define __gpiod_pr(level, desc, fmt, ...) \ diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 00df68c51405..994d46874d56 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -171,6 +171,8 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name); struct gpio_desc *gpio_to_desc(unsigned gpio); int desc_to_gpio(const struct gpio_desc *desc); +int gpiod_hwgpio(const struct gpio_desc *desc); + struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, -- cgit v1.2.3 From 44472d1b83127e579c798ff92a07ae86d98b61b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Oct 2025 13:07:32 +0200 Subject: atomic: Skip alignment check for try_cmpxchg() old arg The 'old' argument in atomic_try_cmpxchg() and related functions is a pointer to a normal non-atomic integer number, which does not require to be naturally aligned, unlike the atomic_t/atomic64_t types themselves. In order to add an alignment check with CONFIG_DEBUG_ATOMIC into the normal instrument_atomic_read_write() helper, change this check to use the non-atomic instrument_read_write(), the same way that was done earlier for try_cmpxchg() in commit ec570320b09f ("locking/atomic: Correct (cmp)xchg() instrumentation"). This prevents warnings on m68k calling the 32-bit atomic_try_cmpxchg() with 16-bit aligned arguments as well as several more architectures including x86-32 when calling atomic64_try_cmpxchg() with 32-bit aligned u64 arguments. 
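For context, a minimal sketch of the call pattern being instrumented (hypothetical caller; the point is that 'old' is an ordinary local variable with only the natural alignment of its own type):

static void example_inc_below(atomic_t *v, int limit)
{
	int old = atomic_read(v);	/* plain int, not an atomic_t */

	do {
		if (old >= limit)
			return;
		/* on failure, *old is updated with the current value via a plain store */
	} while (!atomic_try_cmpxchg(v, &old, old + 1));
}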
Reported-by: Finn Thain Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/cover.1757810729.git.fthain@linux-m68k.org/ --- include/linux/atomic/atomic-instrumented.h | 26 +++++++++++++------------- scripts/atomic/gen-atomic-instrumented.sh | 11 +++++++---- 2 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 9409a6ddf3e0..37ab6314a9f7 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } @@ -1298,7 +1298,7 @@ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } @@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } @@ -1343,7 +1343,7 @@ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } @@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } @@ -2876,7 +2876,7 @@ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } @@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } @@ -2921,7 +2921,7 @@ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } @@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } @@ -4454,7 +4454,7 @@ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, 
sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } @@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } @@ -4499,7 +4499,7 @@ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 8829b337928e9508259079d32581775ececd415b +// f618ac667f868941a84ce0ab2242f1786e049ed4 diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh index 592f3ec89b5f..9c1d53f81eb2 100755 --- a/scripts/atomic/gen-atomic-instrumented.sh +++ b/scripts/atomic/gen-atomic-instrumented.sh @@ -12,7 +12,7 @@ gen_param_check() local arg="$1"; shift local type="${arg%%:*}" local name="$(gen_param_name "${arg}")" - local rw="write" + local rw="atomic_write" case "${type#c}" in i) return;; @@ -20,14 +20,17 @@ gen_param_check() if [ ${type#c} != ${type} ]; then # We don't write to constant parameters. - rw="read" + rw="atomic_read" + elif [ "${type}" = "p" ] ; then + # The "old" argument in try_cmpxchg() gets accessed non-atomically + rw="read_write" elif [ "${meta}" != "s" ]; then # An atomic RMW: if this parameter is not a constant, and this atomic is # not just a 's'tore, this parameter is both read from and written to. - rw="read_write" + rw="atomic_read_write" fi - printf "\tinstrument_atomic_${rw}(${name}, sizeof(*${name}));\n" + printf "\tinstrument_${rw}(${name}, sizeof(*${name}));\n" } #gen_params_checks(meta, arg...) -- cgit v1.2.3 From cc39f3872c0865bef992b713338df369554fa9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Oct 2025 22:11:54 +0200 Subject: seqlock: Introduce scoped_seqlock_read() The read_seqbegin/need_seqretry/done_seqretry API is cumbersome and error prone. With the new helper the "typical" code like int seq, nextseq; unsigned long flags; nextseq = 0; do { seq = nextseq; flags = read_seqbegin_or_lock_irqsave(&seqlock, &seq); // read-side critical section nextseq = 1; } while (need_seqretry(&seqlock, seq)); done_seqretry_irqrestore(&seqlock, seq, flags); can be rewritten as scoped_seqlock_read (&seqlock, ss_lock_irqsave) { // read-side critical section } Original idea by Oleg Nesterov; with contributions from Linus. 
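A further usage sketch (hypothetical data structure; assumes a seqlock_t protecting a pair of counters) using the plain spinlock fallback target:

struct example_stats {
	seqlock_t	lock;
	u64		packets;
	u64		bytes;
};

static void example_stats_read(struct example_stats *s, u64 *packets, u64 *bytes)
{
	scoped_seqlock_read (&s->lock, ss_lock) {
		/* lockless first pass; retried under the embedded spinlock on contention */
		*packets = s->packets;
		*bytes = s->bytes;
	}
}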
Originally-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) --- include/linux/seqlock.h | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5ce48eab7a2a..b7bcc4111e90 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1209,4 +1209,115 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } + +enum ss_state { + ss_done = 0, + ss_lock, + ss_lock_irqsave, + ss_lockless, +}; + +struct ss_tmp { + enum ss_state state; + unsigned long data; + spinlock_t *lock; + spinlock_t *lock_irqsave; +}; + +static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) +{ + if (sst->lock) + spin_unlock(sst->lock); + if (sst->lock_irqsave) + spin_unlock_irqrestore(sst->lock_irqsave, sst->data); +} + +extern void __scoped_seqlock_invalid_target(void); + +#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000 +/* + * For some reason some GCC-8 architectures (nios2, alpha) have trouble + * determining that the ss_done state is impossible in __scoped_seqlock_next() + * below. + */ +static inline void __scoped_seqlock_bug(void) { } +#else +/* + * Canary for compiler optimization -- if the compiler doesn't realize this is + * an impossible state, it very likely generates sub-optimal code here. + */ +extern void __scoped_seqlock_bug(void); +#endif + +static inline void +__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) +{ + switch (sst->state) { + case ss_done: + __scoped_seqlock_bug(); + return; + + case ss_lock: + case ss_lock_irqsave: + sst->state = ss_done; + return; + + case ss_lockless: + if (!read_seqretry(lock, sst->data)) { + sst->state = ss_done; + return; + } + break; + } + + switch (target) { + case ss_done: + __scoped_seqlock_invalid_target(); + return; + + case ss_lock: + sst->lock = &lock->lock; + spin_lock(sst->lock); + sst->state = ss_lock; + return; + + case ss_lock_irqsave: + sst->lock_irqsave = &lock->lock; + spin_lock_irqsave(sst->lock_irqsave, sst->data); + sst->state = ss_lock_irqsave; + return; + + case ss_lockless: + sst->data = read_seqbegin(lock); + return; + } +} + +#define __scoped_seqlock_read(_seqlock, _target, _s) \ + for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ + { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \ + _s.state != ss_done; \ + __scoped_seqlock_next(&_s, _seqlock, _target)) + +/** + * scoped_seqlock_read (lock, ss_state) - execute the read side critical + * section without manual sequence + * counter handling or calls to other + * helpers + * @lock: pointer to seqlock_t protecting the data + * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating + * the type of critical read section + * + * Example: + * + * scoped_seqlock_read (&lock, ss_lock) { + * // read-side critical section + * } + * + * Starts with a lockess pass first. If it fails, restarts the critical + * section with the lock held. + */ +#define scoped_seqlock_read(_seqlock, _target) \ + __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) + #endif /* __LINUX_SEQLOCK_H */ -- cgit v1.2.3 From ebaec90ec0b5850ab80ca017e7b63183adcca131 Mon Sep 17 00:00:00 2001 From: Samuel Kayode Date: Wed, 1 Oct 2025 11:42:38 -0400 Subject: mfd: pf1550: Add core driver for the PF1550 PMIC There are 3 sub-devices for which the drivers will be added in subsequent patches. 
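For context, a hypothetical sketch (not part of this patch; the real sub-drivers follow later in the series) of how a sub-device driver could reach the state shared by this core driver:

	#include <linux/mfd/pf1550.h>
	#include <linux/platform_device.h>

	static int pf1550_onkey_probe(struct platform_device *pdev)
	{
		/*
		 * The core driver stores its pf1550_ddata as the I2C client's
		 * drvdata, so an MFD child finds it through its parent device.
		 */
		struct pf1550_ddata *pf1550 = dev_get_drvdata(pdev->dev.parent);
		int irq;

		if (!pf1550)
			return -ENODEV;

		/* Cell resources are virtual IRQs from the onkey regmap-irq domain. */
		irq = platform_get_irq(pdev, 0);
		if (irq < 0)
			return irq;

		/* ... register an input device and request 'irq' here ... */
		return 0;
	}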
Signed-off-by: Samuel Kayode Reviewed-by: Frank Li Tested-by: Sean Nyekjaer Link: https://patch.msgid.link/20251001-pf1550-v12-2-a3302aa41687@savoirfairelinux.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 16 ++ drivers/mfd/Makefile | 2 + drivers/mfd/pf1550.c | 367 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pf1550.h | 273 +++++++++++++++++++++++++++++++++ 4 files changed, 658 insertions(+) create mode 100644 drivers/mfd/pf1550.c create mode 100644 include/linux/mfd/pf1550.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 6cec1858947b..219ee6ddf516 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -605,6 +605,22 @@ config MFD_MX25_TSADC i.MX25 processors. They consist of a conversion queue for general purpose ADC and a queue for Touchscreens. +config MFD_PF1550 + tristate "NXP PF1550 PMIC Support" + depends on I2C=y && OF + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + Say yes here to add support for NXP PF1550. This is a companion Power + Management IC with regulators, onkey, and charger control on chip. + This driver provides common support for accessing the device; + additional drivers must be enabled in order to use the functionality + of the device. + + This driver can also be built as a module and if so will be called + pf1550. + config MFD_HI6421_PMIC tristate "HiSilicon Hi6421 PMU/Codec IC" depends on OF diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 865e9f12faff..566952f191b5 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -122,6 +122,8 @@ obj-$(CONFIG_MFD_MC13XXX) += mc13xxx-core.o obj-$(CONFIG_MFD_MC13XXX_SPI) += mc13xxx-spi.o obj-$(CONFIG_MFD_MC13XXX_I2C) += mc13xxx-i2c.o +obj-$(CONFIG_MFD_PF1550) += pf1550.o + obj-$(CONFIG_MFD_NCT6694) += nct6694.o obj-$(CONFIG_MFD_CORE) += mfd-core.o diff --git a/drivers/mfd/pf1550.c b/drivers/mfd/pf1550.c new file mode 100644 index 000000000000..c4f567c05564 --- /dev/null +++ b/drivers/mfd/pf1550.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Core driver for the PF1550 + * + * Copyright (C) 2016 Freescale Semiconductor, Inc. + * Robin Gong + * + * Portions Copyright (c) 2025 Savoir-faire Linux Inc. 
+ * Samuel Kayode + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct regmap_config pf1550_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = PF1550_PMIC_REG_END, +}; + +static const struct regmap_irq pf1550_irqs[] = { + REGMAP_IRQ_REG(PF1550_IRQ_CHG, 0, IRQ_CHG), + REGMAP_IRQ_REG(PF1550_IRQ_REGULATOR, 0, IRQ_REGULATOR), + REGMAP_IRQ_REG(PF1550_IRQ_ONKEY, 0, IRQ_ONKEY), +}; + +static const struct regmap_irq_chip pf1550_irq_chip = { + .name = "pf1550", + .status_base = PF1550_PMIC_REG_INT_CATEGORY, + .init_ack_masked = 1, + .num_regs = 1, + .irqs = pf1550_irqs, + .num_irqs = ARRAY_SIZE(pf1550_irqs), +}; + +static const struct regmap_irq pf1550_regulator_irqs[] = { + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW1_LS, 0, PMIC_IRQ_SW1_LS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW2_LS, 0, PMIC_IRQ_SW2_LS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW3_LS, 0, PMIC_IRQ_SW3_LS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW1_HS, 3, PMIC_IRQ_SW1_HS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW2_HS, 3, PMIC_IRQ_SW2_HS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_SW3_HS, 3, PMIC_IRQ_SW3_HS), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO1_FAULT, 16, PMIC_IRQ_LDO1_FAULT), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO2_FAULT, 16, PMIC_IRQ_LDO2_FAULT), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_LDO3_FAULT, 16, PMIC_IRQ_LDO3_FAULT), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_TEMP_110, 24, PMIC_IRQ_TEMP_110), + REGMAP_IRQ_REG(PF1550_PMIC_IRQ_TEMP_125, 24, PMIC_IRQ_TEMP_125), +}; + +static const struct regmap_irq_chip pf1550_regulator_irq_chip = { + .name = "pf1550-regulator", + .status_base = PF1550_PMIC_REG_SW_INT_STAT0, + .ack_base = PF1550_PMIC_REG_SW_INT_STAT0, + .mask_base = PF1550_PMIC_REG_SW_INT_MASK0, + .use_ack = 1, + .init_ack_masked = 1, + .num_regs = 25, + .irqs = pf1550_regulator_irqs, + .num_irqs = ARRAY_SIZE(pf1550_regulator_irqs), +}; + +static const struct resource regulator_resources[] = { + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW1_LS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW2_LS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW3_LS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW1_HS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW2_HS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_SW3_HS), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO1_FAULT), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO2_FAULT), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_LDO3_FAULT), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_TEMP_110), + DEFINE_RES_IRQ(PF1550_PMIC_IRQ_TEMP_125), +}; + +static const struct regmap_irq pf1550_onkey_irqs[] = { + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_PUSHI, 0, ONKEY_IRQ_PUSHI), + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_1SI, 0, ONKEY_IRQ_1SI), + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_2SI, 0, ONKEY_IRQ_2SI), + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_3SI, 0, ONKEY_IRQ_3SI), + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_4SI, 0, ONKEY_IRQ_4SI), + REGMAP_IRQ_REG(PF1550_ONKEY_IRQ_8SI, 0, ONKEY_IRQ_8SI), +}; + +static const struct regmap_irq_chip pf1550_onkey_irq_chip = { + .name = "pf1550-onkey", + .status_base = PF1550_PMIC_REG_ONKEY_INT_STAT0, + .ack_base = PF1550_PMIC_REG_ONKEY_INT_STAT0, + .mask_base = PF1550_PMIC_REG_ONKEY_INT_MASK0, + .use_ack = 1, + .init_ack_masked = 1, + .num_regs = 1, + .irqs = pf1550_onkey_irqs, + .num_irqs = ARRAY_SIZE(pf1550_onkey_irqs), +}; + +static const struct resource onkey_resources[] = { + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_PUSHI), + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_1SI), + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_2SI), + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_3SI), + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_4SI), + DEFINE_RES_IRQ(PF1550_ONKEY_IRQ_8SI), +}; + +static const struct regmap_irq pf1550_charger_irqs[] = { + 
REGMAP_IRQ_REG(PF1550_CHARG_IRQ_BAT2SOCI, 0, CHARG_IRQ_BAT2SOCI), + REGMAP_IRQ_REG(PF1550_CHARG_IRQ_BATI, 0, CHARG_IRQ_BATI), + REGMAP_IRQ_REG(PF1550_CHARG_IRQ_CHGI, 0, CHARG_IRQ_CHGI), + REGMAP_IRQ_REG(PF1550_CHARG_IRQ_VBUSI, 0, CHARG_IRQ_VBUSI), + REGMAP_IRQ_REG(PF1550_CHARG_IRQ_THMI, 0, CHARG_IRQ_THMI), +}; + +static const struct regmap_irq_chip pf1550_charger_irq_chip = { + .name = "pf1550-charger", + .status_base = PF1550_CHARG_REG_CHG_INT, + .ack_base = PF1550_CHARG_REG_CHG_INT, + .mask_base = PF1550_CHARG_REG_CHG_INT_MASK, + .use_ack = 1, + .init_ack_masked = 1, + .num_regs = 1, + .irqs = pf1550_charger_irqs, + .num_irqs = ARRAY_SIZE(pf1550_charger_irqs), +}; + +static const struct resource charger_resources[] = { + DEFINE_RES_IRQ(PF1550_CHARG_IRQ_BAT2SOCI), + DEFINE_RES_IRQ(PF1550_CHARG_IRQ_BATI), + DEFINE_RES_IRQ(PF1550_CHARG_IRQ_CHGI), + DEFINE_RES_IRQ(PF1550_CHARG_IRQ_VBUSI), + DEFINE_RES_IRQ(PF1550_CHARG_IRQ_THMI), +}; + +static const struct mfd_cell pf1550_regulator_cell = { + .name = "pf1550-regulator", + .num_resources = ARRAY_SIZE(regulator_resources), + .resources = regulator_resources, +}; + +static const struct mfd_cell pf1550_onkey_cell = { + .name = "pf1550-onkey", + .num_resources = ARRAY_SIZE(onkey_resources), + .resources = onkey_resources, +}; + +static const struct mfd_cell pf1550_charger_cell = { + .name = "pf1550-charger", + .num_resources = ARRAY_SIZE(charger_resources), + .resources = charger_resources, +}; + +/* + * The PF1550 is shipped in variants of A0, A1,...A9. Each variant defines a + * configuration of the PMIC in a One-Time Programmable (OTP) memory. + * This memory is accessed indirectly by writing valid keys to specific + * registers of the PMIC. To read the OTP memory after writing the valid keys, + * the OTP register address to be read is written to pf1550 register 0xc4 and + * its value read from pf1550 register 0xc5. 
+ */ +static int pf1550_read_otp(const struct pf1550_ddata *pf1550, unsigned int index, + unsigned int *val) +{ + int ret = 0; + + ret = regmap_write(pf1550->regmap, PF1550_PMIC_REG_KEY, PF1550_OTP_PMIC_KEY); + if (ret) + goto read_err; + + ret = regmap_write(pf1550->regmap, PF1550_CHARG_REG_CHGR_KEY2, PF1550_OTP_CHGR_KEY); + if (ret) + goto read_err; + + ret = regmap_write(pf1550->regmap, PF1550_TEST_REG_KEY3, PF1550_OTP_TEST_KEY); + if (ret) + goto read_err; + + ret = regmap_write(pf1550->regmap, PF1550_TEST_REG_FMRADDR, index); + if (ret) + goto read_err; + + ret = regmap_read(pf1550->regmap, PF1550_TEST_REG_FMRDATA, val); + if (ret) + goto read_err; + + return 0; + +read_err: + return dev_err_probe(pf1550->dev, ret, "OTP reg %x not found!\n", index); +} + +static int pf1550_i2c_probe(struct i2c_client *i2c) +{ + const struct mfd_cell *regulator = &pf1550_regulator_cell; + const struct mfd_cell *charger = &pf1550_charger_cell; + const struct mfd_cell *onkey = &pf1550_onkey_cell; + unsigned int reg_data = 0, otp_data = 0; + struct pf1550_ddata *pf1550; + struct irq_domain *domain; + int irq, ret = 0; + + pf1550 = devm_kzalloc(&i2c->dev, sizeof(*pf1550), GFP_KERNEL); + if (!pf1550) + return -ENOMEM; + + i2c_set_clientdata(i2c, pf1550); + pf1550->dev = &i2c->dev; + pf1550->irq = i2c->irq; + + pf1550->regmap = devm_regmap_init_i2c(i2c, &pf1550_regmap_config); + if (IS_ERR(pf1550->regmap)) + return dev_err_probe(pf1550->dev, PTR_ERR(pf1550->regmap), + "failed to allocate register map\n"); + + ret = regmap_read(pf1550->regmap, PF1550_PMIC_REG_DEVICE_ID, ®_data); + if (ret < 0) + return dev_err_probe(pf1550->dev, ret, "cannot read chip ID\n"); + if (reg_data != PF1550_DEVICE_ID) + return dev_err_probe(pf1550->dev, -ENODEV, "invalid device ID: 0x%02x\n", reg_data); + + /* Regulator DVS for SW2 */ + ret = pf1550_read_otp(pf1550, PF1550_OTP_SW2_SW3, &otp_data); + if (ret) + return ret; + + /* When clear, DVS should be enabled */ + if (!(otp_data & OTP_SW2_DVS_ENB)) + pf1550->dvs2_enable = true; + + /* Regulator DVS for SW1 */ + ret = pf1550_read_otp(pf1550, PF1550_OTP_SW1_SW2, &otp_data); + if (ret) + return ret; + + if (!(otp_data & OTP_SW1_DVS_ENB)) + pf1550->dvs1_enable = true; + + /* Add top level interrupts */ + ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, pf1550->irq, + IRQF_ONESHOT | IRQF_SHARED | + IRQF_TRIGGER_FALLING, + 0, &pf1550_irq_chip, + &pf1550->irq_data); + if (ret) + return ret; + + /* Add regulator */ + irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_REGULATOR); + if (irq < 0) + return dev_err_probe(pf1550->dev, irq, + "Failed to get parent vIRQ(%d) for chip %s\n", + PF1550_IRQ_REGULATOR, pf1550_irq_chip.name); + + ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq, + IRQF_ONESHOT | IRQF_SHARED | + IRQF_TRIGGER_FALLING, 0, + &pf1550_regulator_irq_chip, + &pf1550->irq_data_regulator); + if (ret) + return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n", + pf1550_regulator_irq_chip.name); + + domain = regmap_irq_get_domain(pf1550->irq_data_regulator); + + ret = devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, regulator, 1, NULL, 0, domain); + if (ret) + return ret; + + /* Add onkey */ + irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_ONKEY); + if (irq < 0) + return dev_err_probe(pf1550->dev, irq, + "Failed to get parent vIRQ(%d) for chip %s\n", + PF1550_IRQ_ONKEY, pf1550_irq_chip.name); + + ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq, + IRQF_ONESHOT | IRQF_SHARED | + IRQF_TRIGGER_FALLING, 0, + 
&pf1550_onkey_irq_chip, + &pf1550->irq_data_onkey); + if (ret) + return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n", + pf1550_onkey_irq_chip.name); + + domain = regmap_irq_get_domain(pf1550->irq_data_onkey); + + ret = devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, onkey, 1, NULL, 0, domain); + if (ret) + return ret; + + /* Add battery charger */ + irq = regmap_irq_get_virq(pf1550->irq_data, PF1550_IRQ_CHG); + if (irq < 0) + return dev_err_probe(pf1550->dev, irq, + "Failed to get parent vIRQ(%d) for chip %s\n", + PF1550_IRQ_CHG, pf1550_irq_chip.name); + + ret = devm_regmap_add_irq_chip(pf1550->dev, pf1550->regmap, irq, + IRQF_ONESHOT | IRQF_SHARED | + IRQF_TRIGGER_FALLING, 0, + &pf1550_charger_irq_chip, + &pf1550->irq_data_charger); + if (ret) + return dev_err_probe(pf1550->dev, ret, "Failed to add %s IRQ chip\n", + pf1550_charger_irq_chip.name); + + domain = regmap_irq_get_domain(pf1550->irq_data_charger); + + return devm_mfd_add_devices(pf1550->dev, PLATFORM_DEVID_NONE, charger, 1, NULL, 0, domain); +} + +static int pf1550_suspend(struct device *dev) +{ + struct pf1550_ddata *pf1550 = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) { + enable_irq_wake(pf1550->irq); + disable_irq(pf1550->irq); + } + + return 0; +} + +static int pf1550_resume(struct device *dev) +{ + struct pf1550_ddata *pf1550 = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) { + disable_irq_wake(pf1550->irq); + enable_irq(pf1550->irq); + } + + return 0; +} +static DEFINE_SIMPLE_DEV_PM_OPS(pf1550_pm, pf1550_suspend, pf1550_resume); + +static const struct i2c_device_id pf1550_i2c_id[] = { + { "pf1550" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, pf1550_i2c_id); + +static const struct of_device_id pf1550_dt_match[] = { + { .compatible = "nxp,pf1550" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, pf1550_dt_match); + +static struct i2c_driver pf1550_i2c_driver = { + .driver = { + .name = "pf1550", + .pm = pm_sleep_ptr(&pf1550_pm), + .of_match_table = pf1550_dt_match, + }, + .probe = pf1550_i2c_probe, + .id_table = pf1550_i2c_id, +}; +module_i2c_driver(pf1550_i2c_driver); + +MODULE_DESCRIPTION("NXP PF1550 core driver"); +MODULE_AUTHOR("Robin Gong "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/pf1550.h b/include/linux/mfd/pf1550.h new file mode 100644 index 000000000000..7cb2340ff2bd --- /dev/null +++ b/include/linux/mfd/pf1550.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Declarations for the PF1550 PMIC + * + * Copyright (C) 2016 Freescale Semiconductor, Inc. + * Robin Gong + * + * Portions Copyright (c) 2025 Savoir-faire Linux Inc. 
+ * Samuel Kayode + */ + +#ifndef __LINUX_MFD_PF1550_H +#define __LINUX_MFD_PF1550_H + +#include +#include + +enum pf1550_pmic_reg { + /* PMIC regulator part */ + PF1550_PMIC_REG_DEVICE_ID = 0x00, + PF1550_PMIC_REG_OTP_FLAVOR = 0x01, + PF1550_PMIC_REG_SILICON_REV = 0x02, + + PF1550_PMIC_REG_INT_CATEGORY = 0x06, + PF1550_PMIC_REG_SW_INT_STAT0 = 0x08, + PF1550_PMIC_REG_SW_INT_MASK0 = 0x09, + PF1550_PMIC_REG_SW_INT_SENSE0 = 0x0a, + PF1550_PMIC_REG_SW_INT_STAT1 = 0x0b, + PF1550_PMIC_REG_SW_INT_MASK1 = 0x0c, + PF1550_PMIC_REG_SW_INT_SENSE1 = 0x0d, + PF1550_PMIC_REG_SW_INT_STAT2 = 0x0e, + PF1550_PMIC_REG_SW_INT_MASK2 = 0x0f, + PF1550_PMIC_REG_SW_INT_SENSE2 = 0x10, + PF1550_PMIC_REG_LDO_INT_STAT0 = 0x18, + PF1550_PMIC_REG_LDO_INT_MASK0 = 0x19, + PF1550_PMIC_REG_LDO_INT_SENSE0 = 0x1a, + PF1550_PMIC_REG_TEMP_INT_STAT0 = 0x20, + PF1550_PMIC_REG_TEMP_INT_MASK0 = 0x21, + PF1550_PMIC_REG_TEMP_INT_SENSE0 = 0x22, + PF1550_PMIC_REG_ONKEY_INT_STAT0 = 0x24, + PF1550_PMIC_REG_ONKEY_INT_MASK0 = 0x25, + PF1550_PMIC_REG_ONKEY_INT_SENSE0 = 0x26, + PF1550_PMIC_REG_MISC_INT_STAT0 = 0x28, + PF1550_PMIC_REG_MISC_INT_MASK0 = 0x29, + PF1550_PMIC_REG_MISC_INT_SENSE0 = 0x2a, + + PF1550_PMIC_REG_COINCELL_CONTROL = 0x30, + + PF1550_PMIC_REG_SW1_VOLT = 0x32, + PF1550_PMIC_REG_SW1_STBY_VOLT = 0x33, + PF1550_PMIC_REG_SW1_SLP_VOLT = 0x34, + PF1550_PMIC_REG_SW1_CTRL = 0x35, + PF1550_PMIC_REG_SW1_CTRL1 = 0x36, + PF1550_PMIC_REG_SW2_VOLT = 0x38, + PF1550_PMIC_REG_SW2_STBY_VOLT = 0x39, + PF1550_PMIC_REG_SW2_SLP_VOLT = 0x3a, + PF1550_PMIC_REG_SW2_CTRL = 0x3b, + PF1550_PMIC_REG_SW2_CTRL1 = 0x3c, + PF1550_PMIC_REG_SW3_VOLT = 0x3e, + PF1550_PMIC_REG_SW3_STBY_VOLT = 0x3f, + PF1550_PMIC_REG_SW3_SLP_VOLT = 0x40, + PF1550_PMIC_REG_SW3_CTRL = 0x41, + PF1550_PMIC_REG_SW3_CTRL1 = 0x42, + PF1550_PMIC_REG_VSNVS_CTRL = 0x48, + PF1550_PMIC_REG_VREFDDR_CTRL = 0x4a, + PF1550_PMIC_REG_LDO1_VOLT = 0x4c, + PF1550_PMIC_REG_LDO1_CTRL = 0x4d, + PF1550_PMIC_REG_LDO2_VOLT = 0x4f, + PF1550_PMIC_REG_LDO2_CTRL = 0x50, + PF1550_PMIC_REG_LDO3_VOLT = 0x52, + PF1550_PMIC_REG_LDO3_CTRL = 0x53, + PF1550_PMIC_REG_PWRCTRL0 = 0x58, + PF1550_PMIC_REG_PWRCTRL1 = 0x59, + PF1550_PMIC_REG_PWRCTRL2 = 0x5a, + PF1550_PMIC_REG_PWRCTRL3 = 0x5b, + PF1550_PMIC_REG_SW1_PWRDN_SEQ = 0x5f, + PF1550_PMIC_REG_SW2_PWRDN_SEQ = 0x60, + PF1550_PMIC_REG_SW3_PWRDN_SEQ = 0x61, + PF1550_PMIC_REG_LDO1_PWRDN_SEQ = 0x62, + PF1550_PMIC_REG_LDO2_PWRDN_SEQ = 0x63, + PF1550_PMIC_REG_LDO3_PWRDN_SEQ = 0x64, + PF1550_PMIC_REG_VREFDDR_PWRDN_SEQ = 0x65, + + PF1550_PMIC_REG_STATE_INFO = 0x67, + PF1550_PMIC_REG_I2C_ADDR = 0x68, + PF1550_PMIC_REG_IO_DRV0 = 0x69, + PF1550_PMIC_REG_IO_DRV1 = 0x6a, + PF1550_PMIC_REG_RC_16MHZ = 0x6b, + PF1550_PMIC_REG_KEY = 0x6f, + + /* Charger part */ + PF1550_CHARG_REG_CHG_INT = 0x80, + PF1550_CHARG_REG_CHG_INT_MASK = 0x82, + PF1550_CHARG_REG_CHG_INT_OK = 0x84, + PF1550_CHARG_REG_VBUS_SNS = 0x86, + PF1550_CHARG_REG_CHG_SNS = 0x87, + PF1550_CHARG_REG_BATT_SNS = 0x88, + PF1550_CHARG_REG_CHG_OPER = 0x89, + PF1550_CHARG_REG_CHG_TMR = 0x8a, + PF1550_CHARG_REG_CHG_EOC_CNFG = 0x8d, + PF1550_CHARG_REG_CHG_CURR_CNFG = 0x8e, + PF1550_CHARG_REG_BATT_REG = 0x8f, + PF1550_CHARG_REG_BATFET_CNFG = 0x91, + PF1550_CHARG_REG_THM_REG_CNFG = 0x92, + PF1550_CHARG_REG_VBUS_INLIM_CNFG = 0x94, + PF1550_CHARG_REG_VBUS_LIN_DPM = 0x95, + PF1550_CHARG_REG_USB_PHY_LDO_CNFG = 0x96, + PF1550_CHARG_REG_DBNC_DELAY_TIME = 0x98, + PF1550_CHARG_REG_CHG_INT_CNFG = 0x99, + PF1550_CHARG_REG_THM_ADJ_SETTING = 0x9a, + PF1550_CHARG_REG_VBUS2SYS_CNFG = 0x9b, + PF1550_CHARG_REG_LED_PWM = 0x9c, + 
PF1550_CHARG_REG_FAULT_BATFET_CNFG = 0x9d, + PF1550_CHARG_REG_LED_CNFG = 0x9e, + PF1550_CHARG_REG_CHGR_KEY2 = 0x9f, + + PF1550_TEST_REG_FMRADDR = 0xc4, + PF1550_TEST_REG_FMRDATA = 0xc5, + PF1550_TEST_REG_KEY3 = 0xdf, + + PF1550_PMIC_REG_END = 0xff, +}; + +/* One-Time Programmable(OTP) memory */ +enum pf1550_otp_reg { + PF1550_OTP_SW1_SW2 = 0x1e, + PF1550_OTP_SW2_SW3 = 0x1f, +}; + +#define PF1550_DEVICE_ID 0x7c + +/* Keys for reading OTP */ +#define PF1550_OTP_PMIC_KEY 0x15 +#define PF1550_OTP_CHGR_KEY 0x50 +#define PF1550_OTP_TEST_KEY 0xab + +/* Supported charger modes */ +#define PF1550_CHG_BAT_OFF 1 +#define PF1550_CHG_BAT_ON 2 + +#define PF1550_CHG_PRECHARGE 0 +#define PF1550_CHG_CONSTANT_CURRENT 1 +#define PF1550_CHG_CONSTANT_VOL 2 +#define PF1550_CHG_EOC 3 +#define PF1550_CHG_DONE 4 +#define PF1550_CHG_TIMER_FAULT 6 +#define PF1550_CHG_SUSPEND 7 +#define PF1550_CHG_OFF_INV 8 +#define PF1550_CHG_BAT_OVER 9 +#define PF1550_CHG_OFF_TEMP 10 +#define PF1550_CHG_LINEAR_ONLY 12 +#define PF1550_CHG_SNS_MASK 0xf +#define PF1550_CHG_INT_MASK 0x51 + +#define PF1550_BAT_NO_VBUS 0 +#define PF1550_BAT_LOW_THAN_PRECHARG 1 +#define PF1550_BAT_CHARG_FAIL 2 +#define PF1550_BAT_HIGH_THAN_PRECHARG 4 +#define PF1550_BAT_OVER_VOL 5 +#define PF1550_BAT_NO_DETECT 6 +#define PF1550_BAT_SNS_MASK 0x7 + +#define PF1550_VBUS_UVLO BIT(2) +#define PF1550_VBUS_IN2SYS BIT(3) +#define PF1550_VBUS_OVLO BIT(4) +#define PF1550_VBUS_VALID BIT(5) + +#define PF1550_CHARG_REG_BATT_REG_CHGCV_MASK 0x3f +#define PF1550_CHARG_REG_BATT_REG_VMINSYS_SHIFT 6 +#define PF1550_CHARG_REG_BATT_REG_VMINSYS_MASK GENMASK(7, 6) +#define PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_SHIFT 2 +#define PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_MASK GENMASK(3, 2) + +#define PF1550_ONKEY_RST_EN BIT(7) + +/* DVS enable masks */ +#define OTP_SW1_DVS_ENB BIT(1) +#define OTP_SW2_DVS_ENB BIT(3) + +/* Top level interrupt masks */ +#define IRQ_REGULATOR (BIT(1) | BIT(2) | BIT(3) | BIT(4) | BIT(6)) +#define IRQ_ONKEY BIT(5) +#define IRQ_CHG BIT(0) + +/* Regulator interrupt masks */ +#define PMIC_IRQ_SW1_LS BIT(0) +#define PMIC_IRQ_SW2_LS BIT(1) +#define PMIC_IRQ_SW3_LS BIT(2) +#define PMIC_IRQ_SW1_HS BIT(0) +#define PMIC_IRQ_SW2_HS BIT(1) +#define PMIC_IRQ_SW3_HS BIT(2) +#define PMIC_IRQ_LDO1_FAULT BIT(0) +#define PMIC_IRQ_LDO2_FAULT BIT(1) +#define PMIC_IRQ_LDO3_FAULT BIT(2) +#define PMIC_IRQ_TEMP_110 BIT(0) +#define PMIC_IRQ_TEMP_125 BIT(1) + +/* Onkey interrupt masks */ +#define ONKEY_IRQ_PUSHI BIT(0) +#define ONKEY_IRQ_1SI BIT(1) +#define ONKEY_IRQ_2SI BIT(2) +#define ONKEY_IRQ_3SI BIT(3) +#define ONKEY_IRQ_4SI BIT(4) +#define ONKEY_IRQ_8SI BIT(5) + +/* Charger interrupt masks */ +#define CHARG_IRQ_BAT2SOCI BIT(1) +#define CHARG_IRQ_BATI BIT(2) +#define CHARG_IRQ_CHGI BIT(3) +#define CHARG_IRQ_VBUSI BIT(5) +#define CHARG_IRQ_DPMI BIT(6) +#define CHARG_IRQ_THMI BIT(7) + +enum pf1550_irq { + PF1550_IRQ_CHG, + PF1550_IRQ_REGULATOR, + PF1550_IRQ_ONKEY, +}; + +enum pf1550_pmic_irq { + PF1550_PMIC_IRQ_SW1_LS, + PF1550_PMIC_IRQ_SW2_LS, + PF1550_PMIC_IRQ_SW3_LS, + PF1550_PMIC_IRQ_SW1_HS, + PF1550_PMIC_IRQ_SW2_HS, + PF1550_PMIC_IRQ_SW3_HS, + PF1550_PMIC_IRQ_LDO1_FAULT, + PF1550_PMIC_IRQ_LDO2_FAULT, + PF1550_PMIC_IRQ_LDO3_FAULT, + PF1550_PMIC_IRQ_TEMP_110, + PF1550_PMIC_IRQ_TEMP_125, +}; + +enum pf1550_onkey_irq { + PF1550_ONKEY_IRQ_PUSHI, + PF1550_ONKEY_IRQ_1SI, + PF1550_ONKEY_IRQ_2SI, + PF1550_ONKEY_IRQ_3SI, + PF1550_ONKEY_IRQ_4SI, + PF1550_ONKEY_IRQ_8SI, +}; + +enum pf1550_charg_irq { + PF1550_CHARG_IRQ_BAT2SOCI, + PF1550_CHARG_IRQ_BATI, + PF1550_CHARG_IRQ_CHGI, + 
PF1550_CHARG_IRQ_VBUSI, + PF1550_CHARG_IRQ_THMI, +}; + +enum pf1550_regulators { + PF1550_SW1, + PF1550_SW2, + PF1550_SW3, + PF1550_VREFDDR, + PF1550_LDO1, + PF1550_LDO2, + PF1550_LDO3, +}; + +struct pf1550_ddata { + struct regmap_irq_chip_data *irq_data_regulator; + struct regmap_irq_chip_data *irq_data_charger; + struct regmap_irq_chip_data *irq_data_onkey; + struct regmap_irq_chip_data *irq_data; + struct regmap *regmap; + struct device *dev; + bool dvs1_enable; + bool dvs2_enable; + int irq; +}; + +#endif /* __LINUX_MFD_PF1550_H */ -- cgit v1.2.3 From f7d72d0b3f438b881dba16c7c00493f16e41a821 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:30 +0000 Subject: bpf: save the start of functions in bpf_prog_aux Introduce a new subprog_start field in bpf_prog_aux. This field may be used by JIT compilers wanting to know the real absolute xlated offset of the function being jitted. The func_info[func_id] may have served this purpose, but func_info may be NULL, so JIT compilers can't rely on it. Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 204f9c759a41..3bda915cd7a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1623,6 +1623,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 80c99ef4cac5..4579082068ca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21607,6 +21607,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; + func[i]->aux->subprog_start = subprog_start; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; -- cgit v1.2.3 From 44481e4925327d833f2e37c8741406e4cabfe054 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:31 +0000 Subject: bpf: generalize and export map_get_next_key for arrays The kernel/bpf/array.c file defines the array_map_get_next_key() function which finds the next key for array maps. It actually doesn't use any map fields besides the generic max_entries field. Generalize it, and export as bpf_array_get_next_key() such that it can be re-used by other array-like maps. 
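For reference, a userspace-side sketch of the iteration contract this callback implements; the walk_array_map() helper is hypothetical and uses libbpf's bpf_map_get_next_key() wrapper:

	#include <bpf/bpf.h>

	static void walk_array_map(int map_fd)
	{
		__u32 key, next_key;

		/* A NULL key asks for the first key, i.e. index 0 for array maps. */
		if (bpf_map_get_next_key(map_fd, NULL, &next_key))
			return;

		for (;;) {
			key = next_key;
			/* ... look up / process the element at index 'key' ... */

			/* Fails with ENOENT once key == max_entries - 1 has been visited. */
			if (bpf_map_get_next_key(map_fd, &key, &next_key))
				break;
		}
	}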
Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/arraymap.c | 19 +++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3bda915cd7a8..e53cda0aabb6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2107,6 +2107,12 @@ struct bpf_array { }; }; +/* + * The bpf_array_get_next_key() function may be used for all array-like + * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) + */ +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); + #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0ba790c2d2e5..1eeb31c5b317 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) } /* Called from syscall */ -static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; - if (index >= array->map.max_entries) { + if (index >= map->max_entries) { *next = 0; return 0; } - if (index == array->map.max_entries - 1) + if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; @@ -789,7 +788,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, @@ -815,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, @@ -1204,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, @@ -1308,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, @@ -1344,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = 
bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, @@ -1429,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, -- cgit v1.2.3 From 2f69c5685427308d2f312646779313f3677536bc Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:37 +0000 Subject: bpf: make bpf_insn_successors to return a pointer The bpf_insn_successors() function is used to return successors to a BPF instruction. So far, an instruction could have 0, 1 or 2 successors. Prepare the verifier code to introduction of instructions with more than 2 successors (namely, indirect jumps). To do this, introduce a new struct, struct bpf_iarray, containing an array of bpf instruction indexes and make bpf_insn_successors to return a pointer of that type. The storage for all instructions is allocated in the env->succ, which holds an array of size 2, to be used for all instructions. Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-10-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 ++++++++- kernel/bpf/liveness.c | 36 +++++++++++++++++--------- kernel/bpf/verifier.c | 60 +++++++++++++++++++++++++++++--------------- 3 files changed, 75 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b57222a25a4a..c6eb68b6389c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -509,6 +509,15 @@ struct bpf_map_ptr_state { #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) +/* + * An array of BPF instructions. + * Primary usage: return value of bpf_insn_successors. + */ +struct bpf_iarray { + int cnt; + u32 items[]; +}; + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ @@ -828,6 +837,7 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + struct bpf_iarray *succ; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1050,7 +1060,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index baa742e6cbb6..bffb495bc933 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -34,7 +34,7 @@ * - read and write marks propagation. 
* - The propagation phase is a textbook live variable data flow analysis: * - * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] * state[cc, i].live_before = * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read * @@ -54,7 +54,7 @@ * The equation for "must_write_acc" propagation looks as follows: * * state[cc, i].must_write_acc = - * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] * U state[cc, i].must_write * * (An intersection of all "must_write_acc" for instruction successors @@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn) __diag_push(); __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); -inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +/* + * Returns an array of instructions succ, with succ->items[0], ..., + * succ->items[n-1] with successor instructions, where n=succ->cnt + */ +inline struct bpf_iarray * +bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) { static const struct opcode_info { bool can_jump; @@ -474,19 +479,25 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), #undef _J }; + struct bpf_prog *prog = env->prog; struct bpf_insn *insn = &prog->insnsi[idx]; const struct opcode_info *opcode_info; - int i = 0, insn_sz; + struct bpf_iarray *succ; + int insn_sz; + + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ + succ = env->succ; + succ->cnt = 0; opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; if (opcode_info->can_fallthrough) - succ[i++] = idx + insn_sz; + succ->items[succ->cnt++] = idx + insn_sz; if (opcode_info->can_jump) - succ[i++] = idx + bpf_jmp_offset(insn) + 1; + succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; - return i; + return succ; } __diag_pop(); @@ -548,11 +559,12 @@ static inline bool update_insn(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = env->insn_aux_data; u64 new_before, new_after, must_write_acc; struct per_frame_masks *insn, *succ_insn; - u32 succ_num, s, succ[2]; + struct bpf_iarray *succ; + u32 s; bool changed; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - if (unlikely(succ_num == 0)) + succ = bpf_insn_successors(env, insn_idx); + if (succ->cnt == 0) return false; changed = false; @@ -564,8 +576,8 @@ static inline bool update_insn(struct bpf_verifier_env *env, * of successors plus all "must_write" slots of instruction itself. 
*/ must_write_acc = U64_MAX; - for (s = 0; s < succ_num; ++s) { - succ_insn = get_frame_masks(instance, frame, succ[s]); + for (s = 0; s < succ->cnt; ++s) { + succ_insn = get_frame_masks(instance, frame, succ->items[s]); new_after |= succ_insn->live_before; must_write_acc &= succ_insn->must_write_acc; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4579082068ca..6d175849e57a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17805,6 +17805,22 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } +static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -18025,8 +18041,9 @@ err_free: */ static int compute_postorder(struct bpf_verifier_env *env) { - u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + u32 cur_postorder, i, top, stack_sz, s; int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -18050,11 +18067,11 @@ static int compute_postorder(struct bpf_verifier_env *env) stack_sz--; continue; } - succ_cnt = bpf_insn_successors(env->prog, top, succ); - for (s = 0; s < succ_cnt; ++s) { - if (!state[succ[s]]) { - stack[stack_sz++] = succ[s]; - state[succ[s]] |= DISCOVERED; + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; } } state[top] |= EXPLORED; @@ -24313,14 +24330,13 @@ static int compute_live_registers(struct bpf_verifier_env *env) for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; - int succ_num; - u32 succ[2]; + struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - for (int s = 0; s < succ_num; ++s) - new_out |= state[succ[s]].in; + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; @@ -24373,11 +24389,11 @@ static int compute_scc(struct bpf_verifier_env *env) const u32 insn_cnt = env->prog->len; int stack_sz, dfs_sz, err = 0; u32 *stack, *pre, *low, *dfs; - u32 succ_cnt, i, j, t, w; + u32 i, j, t, w; u32 next_preorder_num; u32 next_scc_id; bool assign_scc; - u32 succ[2]; + struct bpf_iarray *succ; next_preorder_num = 1; next_scc_id = 1; @@ -24484,12 +24500,12 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = bpf_insn_successors(env->prog, w, succ); - for (j = 0; j < succ_cnt; ++j) { - if (pre[succ[j]]) { - low[w] = min(low[w], low[succ[j]]); + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); } else { - dfs[dfs_sz++] = succ[j]; + dfs[dfs_sz++] = succ->items[j]; goto 
dfs_continue; } } @@ -24506,8 +24522,8 @@ dfs_continue: * or if component has a self reference. */ assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ_cnt; ++j) { - if (succ[j] == w) { + for (j = 0; j < succ->cnt; ++j) { + if (succ->items[j] == w) { assign_scc = true; break; } @@ -24569,6 +24585,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -24817,6 +24836,7 @@ err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); + kvfree(env->succ); kvfree(env); return ret; } -- cgit v1.2.3 From 28098defc79fe7d29e6bfe4eb6312991f6bdc3d3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 17 Oct 2025 03:41:52 +0000 Subject: net: add a common function to compute features for upper devices Some high level software drivers need to compute features from lower devices. But each has their own implementations and may lost some feature compute. Let's use one common function to compute features for kinds of these devices. The new helper uses the current bond implementation as the reference one, as the latter already handles all the relevant aspects: netdev features, TSO limits and dst retention. Suggested-by: Paolo Abeni Signed-off-by: Hangbin Liu Reviewed-by: Sabrina Dubroca Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20251017034155.61990-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 18 +++++++++ include/linux/netdevice.h | 1 + net/core/dev.c | 88 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7a01c518e573..93e4da7046a1 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -255,6 +255,24 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) +/* virtual device features */ +#define MASTER_UPPER_DEV_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_ENCAP_ALL | \ + NETIF_F_HIGHDMA | NETIF_F_LRO) + +#define MASTER_UPPER_DEV_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_PARTIAL) + +#define MASTER_UPPER_DEV_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_GSO_SOFTWARE) + +#define MASTER_UPPER_DEV_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ + NETIF_F_GSO_ESP) + +#define MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) + static inline netdev_features_t netdev_base_features(netdev_features_t features) { features &= ~NETIF_F_ONE_FOR_ALL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a687444b27..7f5aad5cc9a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5304,6 +5304,7 @@ static inline netdev_features_t netdev_add_tso_features(netdev_features_t featur int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); diff 
--git a/net/core/dev.c b/net/core/dev.c index 9482b905c66a..378c2d010faf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -12693,6 +12693,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. + */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; 
-- cgit v1.2.3 From 6837c006d4e72d6add451411bcf407e0dea4ad25 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 21 Oct 2025 15:24:47 +0000 Subject: firmware: exynos-acpm: add empty method to allow compile test Provide empty method for devm_acpm_get_by_node() if we aren't building in the CONFIG_EXYNOS_ACPM_PROTOCOL. This allows to test-build the CONFIG_EXYNOS_ACPM_CLK code. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202510211905.RgfWkgss-lkp@intel.com/ Fixes: 40498a742053 ("clk: samsung: add Exynos ACPM clock driver") Signed-off-by: Tudor Ambarus Link: https://patch.msgid.link/20251021-fix-acpm-clk-build-test-v1-1-236a3d6db7f5@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/firmware/samsung/exynos-acpm-protocol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index b1e95435240f..2091da965a5a 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -55,7 +55,16 @@ struct acpm_handle { struct device; +#if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, struct device_node *np); +#else + +static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) +{ + return NULL; +} +#endif #endif /* __EXYNOS_ACPM_PROTOCOL_H */ -- cgit v1.2.3 From fdd00d79dc0e8a3f90be65d5060c55bb115c0f43 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2025 20:35:43 -0700 Subject: ipack: fix ipack.h kernel-doc warnings Fix various kernel-doc warnings in ipack.h: - Remove an empty kernel-doc comment. - Add 2 missing struct short descriptions. - Fix a typo in a description. - Add a missing struct field description. - Add some missing Return descriptions. - Clarify one function short description. 
Warning: ../include/linux/ipack.h:73 Cannot find identifier on line: */ Warning: ../include/linux/ipack.h:74 Cannot find identifier on line: struct ipack_region { Warning: ../include/linux/ipack.h:75 Cannot find identifier on line: phys_addr_t start; Warning: ../include/linux/ipack.h:76 Cannot find identifier on line: size_t size; Warning: ../include/linux/ipack.h:77 Cannot find identifier on line: }; Warning: ../include/linux/ipack.h:78 Cannot find identifier on line: Warning: ../include/linux/ipack.h:79 Cannot find identifier on line: /** Warning: ipack.h:80 missing initial short description on line: * struct ipack_device Warning: ipack.h:163 missing initial short description on line: * struct ipack_bus_device Warning: ipack.h:130 struct member 'id_table' not described in 'ipack_driver' Warning: ipack.h:189 No description found for return value of 'ipack_bus_register' Warning: ipack.h:194 No description found for return value of 'ipack_bus_unregister' *** Warning: ipack.h:202 No description found for return value of 'ipack_driver_register' Warning: ipack.h:221 No description found for return value of 'ipack_device_init' Warning: ipack.h:236 No description found for return value of 'ipack_device_add' Warning: ipack.h:271 No description found for return value of 'ipack_get_carrier' Signed-off-by: Randy Dunlap Acked-by: Vaibhav Gupta Link: https://patch.msgid.link/20251016033543.1142049-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/ipack.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipack.h b/include/linux/ipack.h index 2c6936b8371f..455f6c2a1903 100644 --- a/include/linux/ipack.h +++ b/include/linux/ipack.h @@ -70,15 +70,13 @@ enum ipack_space { IPACK_SPACE_COUNT, }; -/** - */ struct ipack_region { phys_addr_t start; size_t size; }; /** - * struct ipack_device + * struct ipack_device - subsystem representation of an IPack device * * @slot: Slot where the device is plugged in the carrier board * @bus: ipack_bus_device where the device is plugged to. @@ -89,7 +87,7 @@ struct ipack_region { * * Warning: Direct access to mapped memory is possible but the endianness * is not the same with PCI carrier or VME carrier. The endianness is managed - * by the carrier board throught bus->ops. + * by the carrier board through bus->ops. */ struct ipack_device { unsigned int slot; @@ -124,6 +122,7 @@ struct ipack_driver_ops { * struct ipack_driver -- Specific data to each ipack device driver * * @driver: Device driver kernel representation + * @id_table: Device ID table for this driver * @ops: Callbacks provided by the IPack device driver */ struct ipack_driver { @@ -161,7 +160,7 @@ struct ipack_bus_ops { }; /** - * struct ipack_bus_device + * struct ipack_bus_device - IPack bus representation * * @dev: pointer to carrier device * @slots: number of slots available @@ -185,6 +184,8 @@ struct ipack_bus_device { * * The carrier board device should call this function to register itself as * available bus device in ipack. 
+ * + * Return: %NULL on error or &struct ipack_bus_device on success */ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, const struct ipack_bus_ops *ops, @@ -192,6 +193,8 @@ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, /** * ipack_bus_unregister -- unregister an ipack bus + * + * Return: %0 */ int ipack_bus_unregister(struct ipack_bus_device *bus); @@ -200,6 +203,8 @@ int ipack_bus_unregister(struct ipack_bus_device *bus); * * Called by a ipack driver to register itself as a driver * that can manage ipack devices. + * + * Return: zero on success or error code on failure. */ int ipack_driver_register(struct ipack_driver *edrv, struct module *owner, const char *name); @@ -215,7 +220,7 @@ void ipack_driver_unregister(struct ipack_driver *edrv); * function. The rest of the fields will be allocated and populated * during initalization. * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -230,7 +235,7 @@ int ipack_device_init(struct ipack_device *dev); * Add a new IPack device. The call is done by the carrier driver * after calling ipack_device_init(). * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -266,9 +271,11 @@ void ipack_put_device(struct ipack_device *dev); .device = (dev) /** - * ipack_get_carrier - it increase the carrier ref. counter of + * ipack_get_carrier - try to increase the carrier ref. counter of * the carrier module * @dev: mezzanine device which wants to get the carrier + * + * Return: true on success. */ static inline int ipack_get_carrier(struct ipack_device *dev) { -- cgit v1.2.3 From 9fd2eb9e18a0a0b5a127937586388ed0181d9dac Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Sep 2025 15:42:40 +0200 Subject: cdx: make cdx_bus_type constant Now that the driver core can properly handle constant struct bus_type, move the cdx_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
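As a generic illustration (not taken from this patch), the driver core's bus_register() accepts a const pointer, so a bus description kept in read-only memory registers unchanged:

	#include <linux/device/bus.h>
	#include <linux/init.h>

	static const struct bus_type example_bus_type = {
		.name = "example",
	};

	static int __init example_bus_init(void)
	{
		return bus_register(&example_bus_type);
	}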
Cc: Nipun Gupta Cc: Nikhil Agarwal Acked-by: Nipun Gupta Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025091439-sustained-acorn-4af4@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/cdx/cdx.c | 4 ++-- include/linux/cdx/cdx_bus.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index 3d50f8cd9c0b..b39af2f1937f 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -170,7 +170,7 @@ static int cdx_unregister_device(struct device *dev, return 0; } -static void cdx_unregister_devices(struct bus_type *bus) +static void cdx_unregister_devices(const struct bus_type *bus) { /* Reset all the devices attached to cdx bus */ bus_for_each_dev(bus, NULL, NULL, cdx_unregister_device); @@ -651,7 +651,7 @@ static struct attribute *cdx_bus_attrs[] = { }; ATTRIBUTE_GROUPS(cdx_bus); -struct bus_type cdx_bus_type = { +const struct bus_type cdx_bus_type = { .name = "cdx", .match = cdx_bus_match, .probe = cdx_probe, diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 79bb80e56790..b1ba97f6c9ad 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -234,7 +234,7 @@ int __must_check __cdx_driver_register(struct cdx_driver *cdx_driver, */ void cdx_driver_unregister(struct cdx_driver *cdx_driver); -extern struct bus_type cdx_bus_type; +extern const struct bus_type cdx_bus_type; /** * cdx_dev_reset - Reset CDX device -- cgit v1.2.3 From 61e606305672342858a647af3629d9dfcc4e4265 Mon Sep 17 00:00:00 2001 From: Adrian Barnaś Date: Fri, 19 Sep 2025 06:53:27 +0000 Subject: drivers: eisa: make eisa_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the eisa_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919065327.672924-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/eisa/eisa-bus.c | 2 +- include/linux/eisa.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c index edceea083b98..bd76d599109c 100644 --- a/drivers/eisa/eisa-bus.c +++ b/drivers/eisa/eisa-bus.c @@ -135,7 +135,7 @@ static int eisa_bus_uevent(const struct device *dev, struct kobj_uevent_env *env return 0; } -struct bus_type eisa_bus_type = { +const struct bus_type eisa_bus_type = { .name = "eisa", .match = eisa_bus_match, .uevent = eisa_bus_uevent, diff --git a/include/linux/eisa.h b/include/linux/eisa.h index 21a2ecc1e538..cf55630b595b 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -68,7 +68,7 @@ struct eisa_driver { /* These external functions are only available when EISA support is enabled. 
*/ #ifdef CONFIG_EISA -extern struct bus_type eisa_bus_type; +extern const struct bus_type eisa_bus_type; int eisa_driver_register (struct eisa_driver *edrv); void eisa_driver_unregister (struct eisa_driver *edrv); -- cgit v1.2.3 From 8ce6b508f24b4ef3a78c2c0d92e67b9e324c4f7a Mon Sep 17 00:00:00 2001 From: Adrian Barnaś Date: Fri, 19 Sep 2025 07:32:01 +0000 Subject: drivers: rapidio: make rio_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the rio_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919073201.751348-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/rapidio/rio-driver.c | 2 +- include/linux/rio.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 238250e69005..bcfe0b45b377 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -227,7 +227,7 @@ struct class rio_mport_class = { }; EXPORT_SYMBOL_GPL(rio_mport_class); -struct bus_type rio_bus_type = { +const struct bus_type rio_bus_type = { .name = "rapidio", .match = rio_match_bus, .dev_groups = rio_dev_groups, diff --git a/include/linux/rio.h b/include/linux/rio.h index 3c29f40f3c94..2c29f21ba9e5 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -78,7 +78,7 @@ #define RIO_CTAG_RESRVD 0xfffe0000 /* Reserved */ #define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ -extern struct bus_type rio_bus_type; +extern const struct bus_type rio_bus_type; extern struct class rio_mport_class; struct rio_mport; -- cgit v1.2.3 From cebd22dd3a0ac76e0e1f2f369bba710bc6b1dc66 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Thu, 18 Sep 2025 10:54:02 +0800 Subject: platform: Use IOMEM_ERR_PTR for ioremap error returns Replace ERR_PTR() with IOMEM_ERR_PTR() in stubbed ioremap functions to maintain type consistency. The functions return void __iomem * pointers and IOMEM_ERR_PTR() provides proper type casting to avoid sparse warnings. 
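For reference, a hedged caller-side sketch (example_probe() is hypothetical, not part of this patch): both the real helpers and these stubs return either a valid __iomem mapping or an error pointer in the iomem address space, so the usual IS_ERR()/PTR_ERR() check works unchanged whether or not the stubs are compiled in.

  #include <linux/err.h>
  #include <linux/platform_device.h>

  static int example_probe(struct platform_device *pdev)
  {
          void __iomem *base;

          /* The stubbed variant returns IOMEM_ERR_PTR(-EINVAL) here. */
          base = devm_platform_ioremap_resource(pdev, 0);
          if (IS_ERR(base))
                  return PTR_ERR(base);

          return 0;
  }
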
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509060307.JubgnLhc-lkp@intel.com/ Signed-off-by: Pei Xiao Link: https://patch.msgid.link/320f2cc9ada5cb66845daa6bf259000b4cffd8b3.1758163939.git.xiaopei01@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..1d424fed1435 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -80,7 +80,7 @@ static inline void __iomem * devm_platform_get_and_ioremap_resource(struct platform_device *pdev, unsigned int index, struct resource **res) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } @@ -88,14 +88,14 @@ static inline void __iomem * devm_platform_ioremap_resource(struct platform_device *pdev, unsigned int index) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } static inline void __iomem * devm_platform_ioremap_resource_byname(struct platform_device *pdev, const char *name) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } #endif -- cgit v1.2.3 From 6d0ef68955d30be1e218caf160ec32eec23ebc6e Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Tue, 23 Sep 2025 09:54:09 +0800 Subject: arch_topology: move parse_acpi_topology() to common code Currently, RISC-V lacks arch-specific registers for CPU topology properties and must get them from ACPI. Thus, parse_acpi_topology() is moved from arm64/ to drivers/ for RISC-V reuse. Signed-off-by: Yunhui Cui Reviewed-by: Sudeep Holla Link: https://patch.msgid.link/20250923015409.15983-2-cuiyunhui@bytedance.com Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/topology.h | 3 ++ arch/arm64/kernel/topology.c | 101 -------------------------------------- drivers/base/arch_topology.c | 96 +++++++++++++++++++++++++++++++++++- include/linux/arch_topology.h | 5 ++ 4 files changed, 103 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 341174bf9106..b9eaf4ad7085 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -36,6 +36,9 @@ void update_freq_counters_refs(void); #define arch_scale_hw_pressure topology_get_hw_pressure #define arch_update_hw_pressure topology_update_hw_pressure +#undef arch_cpu_is_threaded +#define arch_cpu_is_threaded() (read_cpuid_mpidr() & MPIDR_MT_BITMASK) + #include #endif /* _ASM_ARM_TOPOLOGY_H */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 5d07ee85bdae..5d24dc53799b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -25,107 +25,6 @@ #include #include -#ifdef CONFIG_ACPI -static bool __init acpi_cpu_is_threaded(int cpu) -{ - int is_threaded = acpi_pptt_cpu_is_thread(cpu); - - /* - * if the PPTT doesn't have thread information, assume a homogeneous - * machine and return the current CPU's thread state. - */ - if (is_threaded < 0) - is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; - - return !!is_threaded; -} - -struct cpu_smt_info { - unsigned int thread_num; - int core_id; -}; - -/* - * Propagate the topology information of the processor_topology_node tree to the - * cpu_topology array. 
- */ -int __init parse_acpi_topology(void) -{ - unsigned int max_smt_thread_num = 1; - struct cpu_smt_info *entry; - struct xarray hetero_cpu; - unsigned long hetero_id; - int cpu, topology_id; - - if (acpi_disabled) - return 0; - - xa_init(&hetero_cpu); - - for_each_possible_cpu(cpu) { - topology_id = find_acpi_cpu_topology(cpu, 0); - if (topology_id < 0) - return topology_id; - - if (acpi_cpu_is_threaded(cpu)) { - cpu_topology[cpu].thread_id = topology_id; - topology_id = find_acpi_cpu_topology(cpu, 1); - cpu_topology[cpu].core_id = topology_id; - - /* - * In the PPTT, CPUs below a node with the 'identical - * implementation' flag have the same number of threads. - * Count the number of threads for only one CPU (i.e. - * one core_id) among those with the same hetero_id. - * See the comment of find_acpi_cpu_topology_hetero_id() - * for more details. - * - * One entry is created for each node having: - * - the 'identical implementation' flag - * - its parent not having the flag - */ - hetero_id = find_acpi_cpu_topology_hetero_id(cpu); - entry = xa_load(&hetero_cpu, hetero_id); - if (!entry) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - WARN_ON_ONCE(!entry); - - if (entry) { - entry->core_id = topology_id; - entry->thread_num = 1; - xa_store(&hetero_cpu, hetero_id, - entry, GFP_KERNEL); - } - } else if (entry->core_id == topology_id) { - entry->thread_num++; - } - } else { - cpu_topology[cpu].thread_id = -1; - cpu_topology[cpu].core_id = topology_id; - } - topology_id = find_acpi_cpu_topology_cluster(cpu); - cpu_topology[cpu].cluster_id = topology_id; - topology_id = find_acpi_cpu_topology_package(cpu); - cpu_topology[cpu].package_id = topology_id; - } - - /* - * This is a short loop since the number of XArray elements is the - * number of heterogeneous CPU clusters. On a homogeneous system - * there's only one entry in the XArray. - */ - xa_for_each(&hetero_cpu, hetero_id, entry) { - max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); - xa_erase(&hetero_cpu, hetero_id); - kfree(entry); - } - - cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); - xa_destroy(&hetero_cpu); - return 0; -} -#endif - #ifdef CONFIG_ARM64_AMU_EXTN #define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) #define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1037169abb45..1ccb1eda4ce8 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -823,12 +823,106 @@ void remove_cpu_topology(unsigned int cpu) clear_cpu_topology(cpu); } +#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) +struct cpu_smt_info { + unsigned int thread_num; + int core_id; +}; + +static bool __init acpi_cpu_is_threaded(int cpu) +{ + int is_threaded = acpi_pptt_cpu_is_thread(cpu); + + /* + * if the PPTT doesn't have thread information, check for architecture + * specific fallback if available + */ + if (is_threaded < 0) + is_threaded = arch_cpu_is_threaded(); + + return !!is_threaded; +} + +/* + * Propagate the topology information of the processor_topology_node tree to the + * cpu_topology array. 
+ */ __weak int __init parse_acpi_topology(void) { + unsigned int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; + int cpu, topology_id; + + if (acpi_disabled) + return 0; + + xa_init(&hetero_cpu); + + for_each_possible_cpu(cpu) { + topology_id = find_acpi_cpu_topology(cpu, 0); + if (topology_id < 0) + return topology_id; + + if (acpi_cpu_is_threaded(cpu)) { + cpu_topology[cpu].thread_id = topology_id; + topology_id = find_acpi_cpu_topology(cpu, 1); + cpu_topology[cpu].core_id = topology_id; + + /* + * In the PPTT, CPUs below a node with the 'identical + * implementation' flag have the same number of threads. + * Count the number of threads for only one CPU (i.e. + * one core_id) among those with the same hetero_id. + * See the comment of find_acpi_cpu_topology_hetero_id() + * for more details. + * + * One entry is created for each node having: + * - the 'identical implementation' flag + * - its parent not having the flag + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON_ONCE(!entry); + + if (entry) { + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } + } else { + cpu_topology[cpu].thread_id = -1; + cpu_topology[cpu].core_id = topology_id; + } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; + topology_id = find_acpi_cpu_topology_package(cpu); + cpu_topology[cpu].package_id = topology_id; + } + + /* + * This is a short loop since the number of XArray elements is the + * number of heterogeneous CPU clusters. On a homogeneous system + * there's only one entry in the XArray. + */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } -#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void __init init_cpu_topology(void) { int cpu, ret; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..766ed9cf0e54 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -80,6 +80,11 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) + +#ifndef arch_cpu_is_threaded +#define arch_cpu_is_threaded() (0) +#endif + void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); -- cgit v1.2.3 From 00aaae60faf554c27c95e93d47f200a93ff266ef Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 14 Oct 2025 18:53:53 +0300 Subject: gpio: regmap: add the .fixed_direction_output configuration parameter There are GPIO controllers such as the one present in the LX2160ARDB QIXIS FPGA which have fixed-direction input and output GPIO lines mixed together in a single register. This cannot be modeled using the gpio-regmap as-is since there is no way to present the true direction of a GPIO line. 
In order to make this use case possible, add a new configuration parameter - fixed_direction_output - into the gpio_regmap_config structure. This will enable user drivers to provide a bitmap that represents the fixed direction of the GPIO lines. Signed-off-by: Ioana Ciornei Acked-by: Bartosz Golaszewski Reviewed-by: Michael Walle Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-regmap.c | 26 ++++++++++++++++++++++++-- include/linux/gpio/regmap.h | 5 +++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index ab9e4077fa60..f4267af00027 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -31,6 +31,7 @@ struct gpio_regmap { unsigned int reg_clr_base; unsigned int reg_dir_in_base; unsigned int reg_dir_out_base; + unsigned long *fixed_direction_output; #ifdef CONFIG_REGMAP_IRQ int regmap_irq_line; @@ -134,6 +135,13 @@ static int gpio_regmap_get_direction(struct gpio_chip *chip, unsigned int base, val, reg, mask; int invert, ret; + if (gpio->fixed_direction_output) { + if (test_bit(offset, gpio->fixed_direction_output)) + return GPIO_LINE_DIRECTION_OUT; + else + return GPIO_LINE_DIRECTION_IN; + } + if (gpio->reg_dat_base && !gpio->reg_set_base) return GPIO_LINE_DIRECTION_IN; if (gpio->reg_set_base && !gpio->reg_dat_base) @@ -284,6 +292,17 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config goto err_free_gpio; } + if (config->fixed_direction_output) { + gpio->fixed_direction_output = bitmap_alloc(chip->ngpio, + GFP_KERNEL); + if (!gpio->fixed_direction_output) { + ret = -ENOMEM; + goto err_free_gpio; + } + bitmap_copy(gpio->fixed_direction_output, + config->fixed_direction_output, chip->ngpio); + } + /* if not set, assume there is only one register */ gpio->ngpio_per_reg = config->ngpio_per_reg; if (!gpio->ngpio_per_reg) @@ -300,7 +319,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config ret = gpiochip_add_data(chip, gpio); if (ret < 0) - goto err_free_gpio; + goto err_free_bitmap; #ifdef CONFIG_REGMAP_IRQ if (config->regmap_irq_chip) { @@ -309,7 +328,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config config->regmap_irq_line, config->regmap_irq_flags, 0, config->regmap_irq_chip, &gpio->irq_chip_data); if (ret) - goto err_free_gpio; + goto err_free_bitmap; irq_domain = regmap_irq_get_domain(gpio->irq_chip_data); } else @@ -326,6 +345,8 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config err_remove_gpiochip: gpiochip_remove(chip); +err_free_bitmap: + bitmap_free(gpio->fixed_direction_output); err_free_gpio: kfree(gpio); return ERR_PTR(ret); @@ -344,6 +365,7 @@ void gpio_regmap_unregister(struct gpio_regmap *gpio) #endif gpiochip_remove(&gpio->gpio_chip); + bitmap_free(gpio->fixed_direction_output); kfree(gpio); } EXPORT_SYMBOL_GPL(gpio_regmap_unregister); diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 622a2939ebe0..87983a5f3681 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -38,6 +38,10 @@ struct regmap; * offset to a register/bitmask pair. If not * given the default gpio_regmap_simple_xlate() * is used. + * @fixed_direction_output: + * (Optional) Bitmap representing the fixed direction of + * the GPIO lines. Useful when there are GPIO lines with a + * fixed direction mixed together in the same register. 
* @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). @@ -85,6 +89,7 @@ struct gpio_regmap_config { int reg_stride; int ngpio_per_reg; struct irq_domain *irq_domain; + unsigned long *fixed_direction_output; #ifdef CONFIG_REGMAP_IRQ struct regmap_irq_chip *regmap_irq_chip; -- cgit v1.2.3 From f82890c98f3e3fd61983e9021354c632ecd47427 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 15 Oct 2025 04:30:13 +0000 Subject: tcpm: Parse and log AVS APDO The USB PD specification introduced new Adjustable Voltage Supply (AVS) types for both Standard Power Range (SPR) and Extended Power Range (EPR) sources. Add definitions to correctly parse and handle the new AVS APDO. Use bitfield macros to add inline helper functions to extract voltage, current, power, and peak current fields to parse and log the details of the new EPR AVS and SPR AVS APDO. Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Reviewed-by: Kyle Tso Reviewed-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251015043017.3382908-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 15 +++++++++- include/linux/usb/pd.h | 69 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b2a568a5bc9b..c65aa8104950 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -823,10 +823,23 @@ static void tcpm_log_source_caps(struct tcpm_port *port) case PDO_TYPE_APDO: if (pdo_apdo_type(pdo) == APDO_TYPE_PPS) scnprintf(msg, sizeof(msg), - "%u-%u mV, %u mA", + "PPS %u-%u mV, %u mA", pdo_pps_apdo_min_voltage(pdo), pdo_pps_apdo_max_voltage(pdo), pdo_pps_apdo_max_current(pdo)); + else if (pdo_apdo_type(pdo) == APDO_TYPE_EPR_AVS) + scnprintf(msg, sizeof(msg), + "EPR AVS %u-%u mV %u W peak_current: %u", + pdo_epr_avs_apdo_min_voltage_mv(pdo), + pdo_epr_avs_apdo_max_voltage_mv(pdo), + pdo_epr_avs_apdo_pdp_w(pdo), + pdo_epr_avs_apdo_src_peak_current(pdo)); + else if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) + scnprintf(msg, sizeof(msg), + "SPR AVS 9-15 V: %u mA 15-20 V: %u mA peak_current: %u", + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(pdo), + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(pdo), + pdo_spr_avs_apdo_src_peak_current(pdo)); else strcpy(msg, "undefined APDO"); break; diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 3068c3084eb6..6ccd1b2af993 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -6,6 +6,7 @@ #ifndef __LINUX_USB_PD_H #define __LINUX_USB_PD_H +#include #include #include #include @@ -271,9 +272,11 @@ enum pd_pdo_type { enum pd_apdo_type { APDO_TYPE_PPS = 0, + APDO_TYPE_EPR_AVS = 1, + APDO_TYPE_SPR_AVS = 2, }; -#define PDO_APDO_TYPE_SHIFT 28 /* Only valid value currently is 0x0 - PPS */ +#define PDO_APDO_TYPE_SHIFT 28 #define PDO_APDO_TYPE_MASK 0x3 #define PDO_APDO_TYPE(t) ((t) << PDO_APDO_TYPE_SHIFT) @@ -297,6 +300,35 @@ enum pd_apdo_type { PDO_PPS_APDO_MIN_VOLT(min_mv) | PDO_PPS_APDO_MAX_VOLT(max_mv) | \ PDO_PPS_APDO_MAX_CURR(max_ma)) +/* + * Applicable only to EPR AVS APDO source cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_EPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both EPR AVS APDO source and sink cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + * Table 6.22 EPR 
Adjustable Voltage Supply APDO – Sink + */ +#define PDO_EPR_AVS_APDO_MAX_VOLT GENMASK(25, 17) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_MIN_VOLT GENMASK(15, 8) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_PDP GENMASK(7, 0) /* 1W unit */ + +/* + * Applicable only SPR AVS APDO source cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_SPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both SPR AVS APDO source and sink cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + * Table 6.21 SPR Adjustable Voltage Supply APDO – Sink + */ +#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ +#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -350,6 +382,41 @@ static inline unsigned int pdo_pps_apdo_max_current(u32 pdo) PDO_PPS_APDO_CURR_MASK) * 50; } +static inline unsigned int pdo_epr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_epr_avs_apdo_min_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_max_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_pdp_w(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PDP, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_9v_to_15v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR, pdo) * 10; +} + +static inline unsigned int pdo_spr_avs_apdo_15v_to_20v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR, pdo) * 10; +} + /* RDO: Request Data Object */ #define RDO_OBJ_POS_SHIFT 28 #define RDO_OBJ_POS_MASK 0x7 -- cgit v1.2.3 From 832c8d3fce77cf03cc225fc555c1bffa1c547ba1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 14 Oct 2025 18:06:47 +0200 Subject: usb: typec: ps883x: Add USB4 mode and TBT3 altmode support This chip can do some more than the driver currently describes. Add support for configuring it for various flavors of TBT3/USB4 operation. 
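As a hedged illustration of the state the new branches consume (the values are made up; only the struct, field, and macro names appear in the change below), a TBT3 attach over an active cable with unidirectional LSRX link training would reach the retimer roughly as:

  #include <linux/usb/typec_tbt.h>

  /* Illustrative only: altmode data handed down by the Type-C stack. */
  struct typec_thunderbolt_data tbt_data = {
          .cable_mode = TBT_CABLE_ACTIVE_PASSIVE,    /* maps to CONN_STATUS_0_ACTIVE_CABLE */
          .enter_vdo  = TBT_ENTER_MODE_UNI_DIR_LSRX, /* maps to CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT */
  };
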
Reviewed-by: Jack Pham Signed-off-by: Konrad Dybcio Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251014-topic-ps883x_usb4-v1-3-e6adb1a4296e@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/ps883x.c | 29 +++++++++++++++++++++++++++++ include/linux/usb/typec_tbt.h | 1 + 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/typec/mux/ps883x.c b/drivers/usb/typec/mux/ps883x.c index 72f1e737ca4b..7c61629b36d6 100644 --- a/drivers/usb/typec/mux/ps883x.c +++ b/drivers/usb/typec/mux/ps883x.c @@ -14,15 +14,18 @@ #include #include #include +#include #include #include #include #include +#include #define REG_USB_PORT_CONN_STATUS_0 0x00 #define CONN_STATUS_0_CONNECTION_PRESENT BIT(0) #define CONN_STATUS_0_ORIENTATION_REVERSED BIT(1) +#define CONN_STATUS_0_ACTIVE_CABLE BIT(2) #define CONN_STATUS_0_USB_3_1_CONNECTED BIT(5) #define REG_USB_PORT_CONN_STATUS_1 0x01 @@ -34,6 +37,10 @@ #define REG_USB_PORT_CONN_STATUS_2 0x02 +#define CONN_STATUS_2_TBT_CONNECTED BIT(0) +#define CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT BIT(4) +#define CONN_STATUS_2_USB4_CONNECTED BIT(7) + struct ps883x_retimer { struct i2c_client *client; struct gpio_desc *reset_gpio; @@ -95,6 +102,8 @@ static int ps883x_configure(struct ps883x_retimer *retimer, int cfg0, static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state *state) { + struct typec_thunderbolt_data *tb_data; + const struct enter_usb_data *eudo_data; int cfg0 = CONN_STATUS_0_CONNECTION_PRESENT; int cfg1 = 0x00; int cfg2 = 0x00; @@ -120,6 +129,18 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state break; } break; + case USB_TYPEC_TBT_SID: + tb_data = state->data; + + /* Unconditional */ + cfg2 |= CONN_STATUS_2_TBT_CONNECTED; + + if (tb_data->cable_mode & TBT_CABLE_ACTIVE_PASSIVE) + cfg0 |= CONN_STATUS_0_ACTIVE_CABLE; + + if (tb_data->enter_vdo & TBT_ENTER_MODE_UNI_DIR_LSRX) + cfg2 |= CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT; + break; default: dev_err(&retimer->client->dev, "Got unsupported SID: 0x%x\n", state->alt->svid); @@ -135,6 +156,14 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state case TYPEC_MODE_USB3: cfg0 |= CONN_STATUS_0_USB_3_1_CONNECTED; break; + case TYPEC_MODE_USB4: + eudo_data = state->data; + + cfg2 |= CONN_STATUS_2_USB4_CONNECTED; + + if (FIELD_GET(EUDO_CABLE_TYPE_MASK, eudo_data->eudo) != EUDO_CABLE_TYPE_PASSIVE) + cfg0 |= CONN_STATUS_0_ACTIVE_CABLE; + break; default: dev_err(&retimer->client->dev, "Got unsupported mode: %lu\n", state->mode); diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index 55dcea12082c..0b570f1b8bc8 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -55,6 +55,7 @@ struct typec_thunderbolt_data { /* TBT3 Device Enter Mode VDO bits */ #define TBT_ENTER_MODE_CABLE_SPEED(s) TBT_SET_CABLE_SPEED(s) +#define TBT_ENTER_MODE_UNI_DIR_LSRX BIT(23) #define TBT_ENTER_MODE_ACTIVE_CABLE BIT(24) #endif /* __USB_TYPEC_TBT_H */ -- cgit v1.2.3 From db82b8dbf5f06d7b1abec4e1326ed8c02fa16897 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 20 Oct 2025 17:03:28 +0200 Subject: PM: runtime: Fix conditional guard definitions Since pm_runtime_get_active() returns 0 on success, all of the DEFINE_GUARD_COND() macros in pm_runtime.h need the "_RET == 0" condition at the end of the argument list or they would not work correctly. 
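To make the effect concrete, one possible consumer sketch (example_do_io() is a placeholder; the guard name comes from the definitions below): with the success condition in place, the guarded block only runs when the resume actually succeeded, and the failure statement is taken otherwise.

  #include <linux/cleanup.h>
  #include <linux/pm_runtime.h>

  static int example_do_io(struct device *dev)
  {
          scoped_cond_guard(pm_runtime_active_try_enabled, return -ENXIO, dev) {
                  /*
                   * Reached only when pm_runtime_resume_and_get() returned 0.
                   * The usage count is dropped again when the scope is left.
                   */
          }
          return 0;
  }
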
Fixes: 9a0abc39450a ("PM: runtime: Add auto-cleanup macros for "resume and get" operations") Reported-by: kernel test robot Link: https://lore.kernel.org/linux-pm/202510191529.BCyjKlLQ-lkp@intel.com/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Tested-by: Farhan Ali Link: https://patch.msgid.link/5943878.DvuYhMxLoT@rafael.j.wysocki --- include/linux/pm_runtime.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index a3f44f6c2da1..0b436e15f4cd 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -629,13 +629,13 @@ DEFINE_GUARD(pm_runtime_active_auto, struct device *, * device. */ DEFINE_GUARD_COND(pm_runtime_active, _try, - pm_runtime_get_active(_T, RPM_TRANSPARENT)) + pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, - pm_runtime_resume_and_get(_T)) + pm_runtime_resume_and_get(_T), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active_auto, _try, - pm_runtime_get_active(_T, RPM_TRANSPARENT)) + pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, - pm_runtime_resume_and_get(_T)) + pm_runtime_resume_and_get(_T), _RET == 0) /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. -- cgit v1.2.3 From 9025688bf6d427e553aca911308cd92e92634f51 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 20 Oct 2025 10:53:40 -0700 Subject: module: Fix device table module aliases Commit 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME") inadvertently broke module alias generation for modules which rely on MODULE_DEVICE_TABLE(). It removed the "kmod_" prefix from __KBUILD_MODNAME, which caused MODULE_DEVICE_TABLE() to generate a symbol name which no longer matched the format expected by handle_moddevtable() in scripts/mod/file2alias.c. As a result, modpost failed to find the device tables, leading to missing module aliases. Fix this by explicitly adding the "kmod_" string within the MODULE_DEVICE_TABLE() macro itself, restoring the symbol name to the format expected by file2alias.c. Fixes: 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME") Reported-by: Alexander Stein Reported-by: Marek Szyprowski Reported-by: Mark Brown Reported-by: Cosmin Tanislav Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Tested-by: Cosmin Tanislav Tested-by: Marek Szyprowski Tested-by: Mark Brown Tested-by: Alexander Stein Tested-by: Chen-Yu Tsai Tested-by: Anders Roxell Link: https://patch.msgid.link/e52ee3edf32874da645a9e037a7d77c69893a22a.1760982784.git.jpoimboe@kernel.org --- include/linux/module.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index e135cc79acee..d80c3ea57472 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ - __PASTE(__, name))))) + __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. 
*/ #define MODULE_DEVICE_TABLE(type, name) \ -- cgit v1.2.3 From 5c5028ee594ce5f907ca6ad1c32cca6a15098464 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Oct 2025 13:47:15 -0700 Subject: block: rename min_segment_size Despite its name, the block layer is fine with segments smaller that the "min_segment_size" limit. The value is an optimization limit indicating the largest segment that can be used without considering boundary limits. Smaller segments can take a fast path, so give it a name that reflects that: max_fast_segment_size. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-settings.c | 4 ++-- block/blk.h | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 37864c5d287e..c47d18587a0b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -336,7 +336,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= lim->min_segment_size) { + bv.bv_offset + bv.bv_len <= lim->max_fast_segment_size) { nsegs++; bytes += bv.bv_len; } else { diff --git a/block/blk-settings.c b/block/blk-settings.c index 54cffaae4df4..345b6a271cc3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -457,12 +457,12 @@ int blk_validate_limits(struct queue_limits *lim) return -EINVAL; } - /* setup min segment size for building new segment in fast path */ + /* setup max segment size for building new segment in fast path */ if (lim->seg_boundary_mask > lim->max_segment_size - 1) seg_size = lim->max_segment_size; else seg_size = lim->seg_boundary_mask + 1; - lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + lim->max_fast_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); /* * We require drivers to at least do logical block aligned I/O, but diff --git a/block/blk.h b/block/blk.h index 170794632135..32a10024efba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -377,7 +377,7 @@ static inline bool bio_may_need_split(struct bio *bio, if (bio->bi_vcnt != 1) return true; return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > - lim->min_segment_size; + lim->max_fast_segment_size; } /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 70b671a9a7f7..99be263b31ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -378,7 +378,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; - unsigned int min_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 159e85110891ebc12500d02d4bf214b1d203e305 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Oct 2025 13:43:18 +0300 Subject: ACPI: property: Make acpi_get_next_subnode() static acpi_get_next_subnode() is only used in drivers/acpi/property.c. Remove its prototype from include/linux/acpi.h and make it static. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Laurent Pinchart Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251001104320.1272752-2-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 7 ++++--- include/linux/acpi.h | 10 ---------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 43d5e457814e..dbf86bee62e1 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1329,13 +1329,14 @@ static int stop_on_next(struct acpi_device *adev, void *data) return 0; } -/** +/* * acpi_get_next_subnode - Return the next child node handle for a fwnode * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) +static struct fwnode_handle * +acpi_get_next_subnode(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) { struct acpi_device *adev = to_acpi_device_node(fwnode); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..703323b9fe0c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1349,9 +1349,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child); - struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1450,13 +1447,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline struct fwnode_handle * -acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) -{ - return NULL; -} - static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) -- cgit v1.2.3 From 113cbd62824afdf62d2f3f092809cf37cc7f1dd8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:07 +0200 Subject: blktrace: pass blk_user_trace2 to setup functions Pass struct blk_user_trace_setup2 to blktrace_setup_finalize(). This prepares for the incoming extension of the blktrace protocol with a 64bit act_mask. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- kernel/trace/blktrace.c | 31 ++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c31b8f433116..d1532df84cc8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -597,11 +597,12 @@ err: } static void blk_trace_setup_finalize(struct request_queue *q, - char *name, struct blk_trace *bt, - struct blk_user_trace_setup *buts) + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) { - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); /* * some device names have larger paths - convert the slashes @@ -609,6 +610,7 @@ static void blk_trace_setup_finalize(struct request_queue *q, */ strreplace(buts->name, '/', '_'); + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; @@ -630,6 +632,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; struct blk_trace *bt; int ret; @@ -641,6 +644,15 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts.buf_size || !buts.buf_nr) return -EINVAL; + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, bdev); @@ -648,7 +660,8 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { @@ -664,7 +677,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; struct blk_trace *bt; @@ -674,7 +687,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, if (!cbuts.buf_size || !cbuts.buf_nr) return -EINVAL; - buts = (struct blk_user_trace_setup) { + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -684,16 +697,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, bdev); if (IS_ERR(bt)) { mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + 
blk_trace_setup_finalize(q, name, 1, bt, &buts2); mutex_unlock(&q->debugfs_mutex); - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } -- cgit v1.2.3 From cbe5aeedecc72314c3a8fd0d41d9b270f576aee1 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:05 +0900 Subject: PM: EM: Assign a unique ID when creating a performance domain It is necessary to refer to a specific performance domain from a userspace. For example, the energy model of a particular performance domain is updated. To this end, assign a unique ID to each performance domain to address it, and manage them in a global linked list to look up a specific one by matching ID. IDA is used for ID assignment, and the mutex is used to protect the global list from concurrent access. Note that the mutex (em_pd_list_mutex) is not supposed to hold while holding em_pd_mutex to avoid ABBA deadlock. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-2-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 4 ++++ kernel/power/energy_model.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..2047b546ad11 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -23,6 +23,16 @@ */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. 
+ */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +461,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -660,6 +678,10 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + return ret; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +700,10 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -689,6 +715,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); -- cgit v1.2.3 From c88b6ee3ba3c7bf6386ea0e6de8111acc3d832bc Mon Sep 17 00:00:00 2001 From: Jingyi Wang Date: Wed, 24 Sep 2025 16:24:55 -0700 Subject: soc: qcom: llcc-qcom: Add support for Kaanapali Add system cache table and configs for Kaanapali SoC. 
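For context, a hedged sketch of how a client driver would consume one of the newly exported slice IDs (example_claim_apv_slice() and the choice of LLCC_VIDEO_APV are illustrative only):

  #include <linux/err.h>
  #include <linux/soc/qcom/llcc-qcom.h>

  static int example_claim_apv_slice(void)
  {
          struct llcc_slice_desc *desc;

          desc = llcc_slice_getd(LLCC_VIDEO_APV);
          if (IS_ERR(desc))
                  return PTR_ERR(desc);

          /* Enable the slice; a matching llcc_slice_putd(desc) releases it again. */
          return llcc_slice_activate(desc);
  }
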
Signed-off-by: Jingyi Wang Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250924-knp-llcc-v1-2-ae6a016e5138@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 373 +++++++++++++++++++++++++++++++++++++ include/linux/soc/qcom/llcc-qcom.h | 7 + 2 files changed, 380 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index 857ead56b37d..13e174267294 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -214,6 +214,364 @@ static const struct llcc_slice_config ipq5424_data[] = { }, }; +static const struct llcc_slice_config kaanapali_data[] = { + { + .usecase_id = LLCC_CPUSS, + .slice_id = 1, + .max_cap = 5120, + .priority = 1, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .write_scid_en = true, + .stale_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDSC0, + .slice_id = 2, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_AUDIO, + .slice_id = 35, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMHPGRW, + .slice_id = 25, + .max_cap = 1024, + .priority = 5, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CMPT, + .slice_id = 34, + .max_cap = 4096, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_GPUHTW, + .slice_id = 11, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_GPU, + .slice_id = 9, + .max_cap = 5632, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .write_scid_cacheable_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MMUHWT, + .slice_id = 18, + .max_cap = 768, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_DISP, + .slice_id = 16, + .max_cap = 7168, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .stale_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMHPFX, + .slice_id = 24, + .max_cap = 1024, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMPNG, + .slice_id = 27, + .max_cap = 256, + .priority = 5, + .bonus_ways = 0xfffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CVP, + .slice_id = 8, + .max_cap = 800, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_MODPE, + .slice_id = 29, + .max_cap = 256, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xf0000000, + .mru_uncap_en = true, + .alloc_oneway_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_WRCACHE, + .slice_id = 31, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CVPFW, + .slice_id = 19, + .max_cap = 512, + .priority = 5, + .fixed_size 
= true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CPUMTE, + .slice_id = 7, + .max_cap = 256, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CMPTHCP, + .slice_id = 15, + .max_cap = 256, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_LCPDARE, + .slice_id = 30, + .max_cap = 128, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .alloc_oneway_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_AENPU, + .slice_id = 3, + .max_cap = 3072, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_ISLAND1, + .slice_id = 12, + .max_cap = 7936, + .priority = 7, + .fixed_size = true, + .bonus_ways = 0x7fffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_DISP_WB, + .slice_id = 23, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDVSP, + .slice_id = 4, + .max_cap = 256, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDDEC, + .slice_id = 5, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMOFE, + .slice_id = 33, + .max_cap = 6144, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMRTIP, + .slice_id = 13, + .max_cap = 6144, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMRTRF, + .slice_id = 10, + .max_cap = 3584, + .priority = 3, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMSRTRF, + .slice_id = 21, + .max_cap = 6144, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_VIDEO_APV, + .slice_id = 6, + .max_cap = 768, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_COMPUTE1, + .slice_id = 22, + .max_cap = 4096, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CPUSS_OPP, + .slice_id = 32, + .max_cap = 0, + .priority = 0, + .fixed_size = true, + .bonus_ways = 0, + .activate_on_init = true, + .write_scid_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CPUSSMPAM, + .slice_id = 17, + .max_cap = 2048, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .write_scid_en = true, + .stale_en = true, + .mru_uncap_en 
= true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CAM_IPE_STROV, + .slice_id = 14, + .max_cap = 400, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAM_OFE_STROV, + .slice_id = 20, + .max_cap = 400, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CPUSS_HEU, + .slice_id = 28, + .max_cap = 0, + .priority = 0, + .fixed_size = true, + .bonus_ways = 0, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDM_PNG_FIXED, + .slice_id = 26, + .max_cap = 256, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xff000000, + .activate_on_init = true, + .write_scid_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, +}; + static const struct llcc_slice_config sa8775p_data[] = { { .usecase_id = LLCC_CPUSS, @@ -3505,6 +3863,15 @@ static const u32 llcc_v6_reg_offset[] = { [LLCC_TRP_WRS_CACHEABLE_EN] = 0x00042088, }; +static const struct qcom_llcc_config kaanapali_cfg[] = { + { + .sct_data = kaanapali_data, + .size = ARRAY_SIZE(kaanapali_data), + .reg_offset = llcc_v6_reg_offset, + .edac_reg_offset = &llcc_v6_edac_reg_offset, + }, +}; + static const struct qcom_llcc_config qcs615_cfg[] = { { .sct_data = qcs615_data, @@ -3731,6 +4098,11 @@ static const struct qcom_llcc_config x1e80100_cfg[] = { }, }; +static const struct qcom_sct_config kaanapali_cfgs = { + .llcc_config = kaanapali_cfg, + .num_config = ARRAY_SIZE(kaanapali_cfg), +}; + static const struct qcom_sct_config qcs615_cfgs = { .llcc_config = qcs615_cfg, .num_config = ARRAY_SIZE(qcs615_cfg), @@ -4570,6 +4942,7 @@ err: static const struct of_device_id qcom_llcc_of_match[] = { { .compatible = "qcom,ipq5424-llcc", .data = &ipq5424_cfgs}, + { .compatible = "qcom,kaanapali-llcc", .data = &kaanapali_cfgs}, { .compatible = "qcom,qcs615-llcc", .data = &qcs615_cfgs}, { .compatible = "qcom,qcs8300-llcc", .data = &qcs8300_cfgs}, { .compatible = "qcom,qdu1000-llcc", .data = &qdu1000_cfgs}, diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 7a69210a250c..0287f9182c4d 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -74,7 +74,14 @@ #define LLCC_CAMSRTIP 73 #define LLCC_CAMRTRF 74 #define LLCC_CAMSRTRF 75 +#define LLCC_VIDEO_APV 83 +#define LLCC_COMPUTE1 87 +#define LLCC_CPUSS_OPP 88 #define LLCC_CPUSSMPAM 89 +#define LLCC_CAM_IPE_STROV 92 +#define LLCC_CAM_OFE_STROV 93 +#define LLCC_CPUSS_HEU 94 +#define LLCC_MDM_PNG_FIXED 100 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 67a4b6a89b99aff0883114e4ecba4b11aedc29a5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 6 Feb 2025 16:44:10 -0500 Subject: lsm: split the init code out into lsm_init.c Continue to pull code out of security/security.c to help improve readability by pulling all of the LSM framework initialization code out into a new file. No code changes. 
Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 +- security/Makefile | 2 +- security/lsm.h | 22 ++ security/lsm_init.c | 543 +++++++++++++++++++++++++++++++++++++++++ security/security.c | 597 +++------------------------------------------- 5 files changed, 601 insertions(+), 566 deletions(-) create mode 100644 security/lsm.h create mode 100644 security/lsm_init.c (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 79ec5a2bdcca..0112926ed923 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -170,11 +170,10 @@ struct lsm_info { __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) + /* DO NOT tamper with these variables outside of the LSM framework */ extern char *lsm_names; extern struct lsm_static_calls_table static_calls_table __ro_after_init; -extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; -extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; /** * lsm_get_xattr_slot - Return the next available slot and increment the index diff --git a/security/Makefile b/security/Makefile index 14d87847bce8..4601230ba442 100644 --- a/security/Makefile +++ b/security/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SECURITY) += lsm_syscalls.o obj-$(CONFIG_MMU) += min_addr.o # Object file lists -obj-$(CONFIG_SECURITY) += security.o lsm_notifier.o +obj-$(CONFIG_SECURITY) += security.o lsm_notifier.o lsm_init.o obj-$(CONFIG_SECURITYFS) += inode.o obj-$(CONFIG_SECURITY_SELINUX) += selinux/ obj-$(CONFIG_SECURITY_SMACK) += smack/ diff --git a/security/lsm.h b/security/lsm.h new file mode 100644 index 000000000000..0e1731bad4a7 --- /dev/null +++ b/security/lsm.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * LSM functions + */ + +#ifndef _LSM_H_ +#define _LSM_H_ + +#include + +/* LSM blob configuration */ +extern struct lsm_blob_sizes blob_sizes; + +/* LSM blob caches */ +extern struct kmem_cache *lsm_file_cache; +extern struct kmem_cache *lsm_inode_cache; + +/* LSM blob allocators */ +int lsm_cred_alloc(struct cred *cred, gfp_t gfp); +int lsm_task_alloc(struct task_struct *task); + +#endif /* _LSM_H_ */ diff --git a/security/lsm_init.c b/security/lsm_init.c new file mode 100644 index 000000000000..124213b906af --- /dev/null +++ b/security/lsm_init.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * LSM initialization functions + */ + +#define pr_fmt(fmt) "LSM: " fmt + +#include +#include + +#include "lsm.h" + +char *lsm_names; + +/* Pointers to LSM sections defined in include/asm-generic/vmlinux.lds.h */ +extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; +extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; + +/* Boot-time LSM user choice */ +static __initconst const char *const builtin_lsm_order = CONFIG_LSM; +static __initdata const char *chosen_lsm_order; +static __initdata const char *chosen_major_lsm; + +/* Ordered list of LSMs to initialize. */ +static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1]; +static __initdata struct lsm_info *exclusive; + +static __initdata bool debug; +#define init_debug(...) 
\ + do { \ + if (debug) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static int lsm_append(const char *new, char **result); + +/* Save user chosen LSM */ +static int __init choose_major_lsm(char *str) +{ + chosen_major_lsm = str; + return 1; +} +__setup("security=", choose_major_lsm); + +/* Explicitly choose LSM initialization order. */ +static int __init choose_lsm_order(char *str) +{ + chosen_lsm_order = str; + return 1; +} +__setup("lsm=", choose_lsm_order); + +/* Enable LSM order debugging. */ +static int __init enable_debug(char *str) +{ + debug = true; + return 1; +} +__setup("lsm.debug", enable_debug); + +/* Mark an LSM's enabled flag. */ +static int lsm_enabled_true __initdata = 1; +static int lsm_enabled_false __initdata = 0; +static void __init set_enabled(struct lsm_info *lsm, bool enabled) +{ + /* + * When an LSM hasn't configured an enable variable, we can use + * a hard-coded location for storing the default enabled state. + */ + if (!lsm->enabled) { + if (enabled) + lsm->enabled = &lsm_enabled_true; + else + lsm->enabled = &lsm_enabled_false; + } else if (lsm->enabled == &lsm_enabled_true) { + if (!enabled) + lsm->enabled = &lsm_enabled_false; + } else if (lsm->enabled == &lsm_enabled_false) { + if (enabled) + lsm->enabled = &lsm_enabled_true; + } else { + *lsm->enabled = enabled; + } +} + +static inline bool is_enabled(struct lsm_info *lsm) +{ + if (!lsm->enabled) + return false; + + return *lsm->enabled; +} + +/* Is an LSM already listed in the ordered LSMs list? */ +static bool __init exists_ordered_lsm(struct lsm_info *lsm) +{ + struct lsm_info **check; + + for (check = ordered_lsms; *check; check++) + if (*check == lsm) + return true; + + return false; +} + +/* Append an LSM to the list of ordered LSMs to initialize. */ +static int last_lsm __initdata; +static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) +{ + /* Ignore duplicate selections. */ + if (exists_ordered_lsm(lsm)) + return; + + if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from)) + return; + + /* Enable this LSM, if it is not already set. */ + if (!lsm->enabled) + lsm->enabled = &lsm_enabled_true; + ordered_lsms[last_lsm++] = lsm; + + init_debug("%s ordered: %s (%s)\n", from, lsm->name, + is_enabled(lsm) ? "enabled" : "disabled"); +} + +/* Is an LSM allowed to be initialized? */ +static bool __init lsm_allowed(struct lsm_info *lsm) +{ + /* Skip if the LSM is disabled. */ + if (!is_enabled(lsm)) + return false; + + /* Not allowed if another exclusive LSM already initialized. */ + if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { + init_debug("exclusive disabled: %s\n", lsm->name); + return false; + } + + return true; +} + +static void __init lsm_set_blob_size(int *need, int *lbs) +{ + int offset; + + if (*need <= 0) + return; + + offset = ALIGN(*lbs, sizeof(void *)); + *lbs = offset + *need; + *need = offset; +} + +static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) +{ + if (!needed) + return; + + lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); + lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); + lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib); + /* + * The inode blob gets an rcu_head in addition to + * what the modules might need. 
+ */ + if (needed->lbs_inode && blob_sizes.lbs_inode == 0) + blob_sizes.lbs_inode = sizeof(struct rcu_head); + lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); + lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); + lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key); + lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); + lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event); + lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock); + lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); + lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); + lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev); + lsm_set_blob_size(&needed->lbs_xattr_count, + &blob_sizes.lbs_xattr_count); + lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); +} + +/* Prepare LSM for initialization. */ +static void __init prepare_lsm(struct lsm_info *lsm) +{ + int enabled = lsm_allowed(lsm); + + /* Record enablement (to handle any following exclusive LSMs). */ + set_enabled(lsm, enabled); + + /* If enabled, do pre-initialization work. */ + if (enabled) { + if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { + exclusive = lsm; + init_debug("exclusive chosen: %s\n", lsm->name); + } + + lsm_set_blob_sizes(lsm->blobs); + } +} + +/* Initialize a given LSM, if it is enabled. */ +static void __init initialize_lsm(struct lsm_info *lsm) +{ + if (is_enabled(lsm)) { + int ret; + + init_debug("initializing %s\n", lsm->name); + ret = lsm->init(); + WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); + } +} + +/* + * Current index to use while initializing the lsm id list. + */ +u32 lsm_active_cnt __ro_after_init; +const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; + +/* Populate ordered LSMs list from comma-separated LSM name list. */ +static void __init ordered_lsm_parse(const char *order, const char *origin) +{ + struct lsm_info *lsm; + char *sep, *name, *next; + + /* LSM_ORDER_FIRST is always first. */ + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (lsm->order == LSM_ORDER_FIRST) + append_ordered_lsm(lsm, " first"); + } + + /* Process "security=", if given. */ + if (chosen_major_lsm) { + struct lsm_info *major; + + /* + * To match the original "security=" behavior, this + * explicitly does NOT fallback to another Legacy Major + * if the selected one was separately disabled: disable + * all non-matching Legacy Major LSMs. + */ + for (major = __start_lsm_info; major < __end_lsm_info; + major++) { + if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && + strcmp(major->name, chosen_major_lsm) != 0) { + set_enabled(major, false); + init_debug("security=%s disabled: %s (only one legacy major LSM)\n", + chosen_major_lsm, major->name); + } + } + } + + sep = kstrdup(order, GFP_KERNEL); + next = sep; + /* Walk the list, looking for matching LSMs. */ + while ((name = strsep(&next, ",")) != NULL) { + bool found = false; + + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (strcmp(lsm->name, name) == 0) { + if (lsm->order == LSM_ORDER_MUTABLE) + append_ordered_lsm(lsm, origin); + found = true; + } + } + + if (!found) + init_debug("%s ignored: %s (not built into kernel)\n", + origin, name); + } + + /* Process "security=", if given. 
*/ + if (chosen_major_lsm) { + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (exists_ordered_lsm(lsm)) + continue; + if (strcmp(lsm->name, chosen_major_lsm) == 0) + append_ordered_lsm(lsm, "security="); + } + } + + /* LSM_ORDER_LAST is always last. */ + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (lsm->order == LSM_ORDER_LAST) + append_ordered_lsm(lsm, " last"); + } + + /* Disable all LSMs not in the ordered list. */ + for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { + if (exists_ordered_lsm(lsm)) + continue; + set_enabled(lsm, false); + init_debug("%s skipped: %s (not in requested order)\n", + origin, lsm->name); + } + + kfree(sep); +} + +static void __init report_lsm_order(void) +{ + struct lsm_info **lsm, *early; + int first = 0; + + pr_info("initializing lsm="); + + /* Report each enabled LSM name, comma separated. */ + for (early = __start_early_lsm_info; + early < __end_early_lsm_info; early++) + if (is_enabled(early)) + pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); + for (lsm = ordered_lsms; *lsm; lsm++) + if (is_enabled(*lsm)) + pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); + + pr_cont("\n"); +} + +/** + * lsm_early_cred - during initialization allocate a composite cred blob + * @cred: the cred that needs a blob + * + * Allocate the cred blob for all the modules + */ +static void __init lsm_early_cred(struct cred *cred) +{ + int rc = lsm_cred_alloc(cred, GFP_KERNEL); + + if (rc) + panic("%s: Early cred alloc failed.\n", __func__); +} + +/** + * lsm_early_task - during initialization allocate a composite task blob + * @task: the task that needs a blob + * + * Allocate the task blob for all the modules + */ +static void __init lsm_early_task(struct task_struct *task) +{ + int rc = lsm_task_alloc(task); + + if (rc) + panic("%s: Early task alloc failed.\n", __func__); +} + +static void __init ordered_lsm_init(void) +{ + struct lsm_info **lsm; + + if (chosen_lsm_order) { + if (chosen_major_lsm) { + pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", + chosen_major_lsm, chosen_lsm_order); + chosen_major_lsm = NULL; + } + ordered_lsm_parse(chosen_lsm_order, "cmdline"); + } else + ordered_lsm_parse(builtin_lsm_order, "builtin"); + + for (lsm = ordered_lsms; *lsm; lsm++) + prepare_lsm(*lsm); + + report_lsm_order(); + + init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); + init_debug("file blob size = %d\n", blob_sizes.lbs_file); + init_debug("ib blob size = %d\n", blob_sizes.lbs_ib); + init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); + init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); +#ifdef CONFIG_KEYS + init_debug("key blob size = %d\n", blob_sizes.lbs_key); +#endif /* CONFIG_KEYS */ + init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); + init_debug("sock blob size = %d\n", blob_sizes.lbs_sock); + init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); + init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event); + init_debug("task blob size = %d\n", blob_sizes.lbs_task); + init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); + init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); + init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); + init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map); + init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog); + init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token); + + /* + * Create any kmem_caches needed for blobs + */ + if (blob_sizes.lbs_file) 
+ lsm_file_cache = kmem_cache_create("lsm_file_cache", + blob_sizes.lbs_file, 0, + SLAB_PANIC, NULL); + if (blob_sizes.lbs_inode) + lsm_inode_cache = kmem_cache_create("lsm_inode_cache", + blob_sizes.lbs_inode, 0, + SLAB_PANIC, NULL); + + lsm_early_cred((struct cred *) current->cred); + lsm_early_task(current); + for (lsm = ordered_lsms; *lsm; lsm++) + initialize_lsm(*lsm); +} + +static bool match_last_lsm(const char *list, const char *lsm) +{ + const char *last; + + if (WARN_ON(!list || !lsm)) + return false; + last = strrchr(list, ','); + if (last) + /* Pass the comma, strcmp() will check for '\0' */ + last++; + else + last = list; + return !strcmp(last, lsm); +} + +static int lsm_append(const char *new, char **result) +{ + char *cp; + + if (*result == NULL) { + *result = kstrdup(new, GFP_KERNEL); + if (*result == NULL) + return -ENOMEM; + } else { + /* Check if it is the last registered name */ + if (match_last_lsm(*result, new)) + return 0; + cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); + if (cp == NULL) + return -ENOMEM; + kfree(*result); + *result = cp; + } + return 0; +} + +static void __init lsm_static_call_init(struct security_hook_list *hl) +{ + struct lsm_static_call *scall = hl->scalls; + int i; + + for (i = 0; i < MAX_LSM_COUNT; i++) { + /* Update the first static call that is not used yet */ + if (!scall->hl) { + __static_call_update(scall->key, scall->trampoline, + hl->hook.lsm_func_addr); + scall->hl = hl; + static_branch_enable(scall->active); + return; + } + scall++; + } + panic("%s - Ran out of static slots.\n", __func__); +} + +/** + * security_add_hooks - Add a modules hooks to the hook lists. + * @hooks: the hooks to add + * @count: the number of hooks to add + * @lsmid: the identification information for the security module + * + * Each LSM has to register its hooks with the infrastructure. + */ +void __init security_add_hooks(struct security_hook_list *hooks, int count, + const struct lsm_id *lsmid) +{ + int i; + + /* + * A security module may call security_add_hooks() more + * than once during initialization, and LSM initialization + * is serialized. Landlock is one such case. + * Look at the previous entry, if there is one, for duplication. + */ + if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { + if (lsm_active_cnt >= MAX_LSM_COUNT) + panic("%s Too many LSMs registered.\n", __func__); + lsm_idlist[lsm_active_cnt++] = lsmid; + } + + for (i = 0; i < count; i++) { + hooks[i].lsmid = lsmid; + lsm_static_call_init(&hooks[i]); + } + + /* + * Don't try to append during early_security_init(), we'll come back + * and fix this up afterwards. + */ + if (slab_is_available()) { + if (lsm_append(lsmid->name, &lsm_names) < 0) + panic("%s - Cannot get early memory.\n", __func__); + } +} + +int __init early_security_init(void) +{ + struct lsm_info *lsm; + + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + if (!lsm->enabled) + lsm->enabled = &lsm_enabled_true; + prepare_lsm(lsm); + initialize_lsm(lsm); + } + + return 0; +} + +/** + * security_init - initializes the security framework + * + * This should be called early in the kernel initialization sequence. + */ +int __init security_init(void) +{ + struct lsm_info *lsm; + + init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); + init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); + init_debug("boot arg lsm=%s\n", chosen_lsm_order ? 
: " *unspecified*"); + + /* + * Append the names of the early LSM modules now that kmalloc() is + * available + */ + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + init_debug(" early started: %s (%s)\n", lsm->name, + is_enabled(lsm) ? "enabled" : "disabled"); + if (lsm->enabled) + lsm_append(lsm->name, &lsm_names); + } + + /* Load LSMs in specified order. */ + ordered_lsm_init(); + + return 0; +} diff --git a/security/security.c b/security/security.c index 667479c2e82f..dc9734f0d45c 100644 --- a/security/security.c +++ b/security/security.c @@ -32,24 +32,7 @@ #include #include -#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX - -/* - * Identifier for the LSM static calls. - * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h - * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT - */ -#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX - -/* - * Call the macro M for each LSM hook MAX_LSM_COUNT times. - */ -#define LSM_LOOP_UNROLL(M, ...) \ -do { \ - UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) \ -} while (0) - -#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) +#include "lsm.h" /* * These are descriptions of the reasons that can be passed to the @@ -90,21 +73,29 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -static struct kmem_cache *lsm_file_cache; -static struct kmem_cache *lsm_inode_cache; +struct lsm_blob_sizes blob_sizes; -char *lsm_names; -static struct lsm_blob_sizes blob_sizes __ro_after_init; +struct kmem_cache *lsm_file_cache; +struct kmem_cache *lsm_inode_cache; -/* Boot-time LSM user choice */ -static __initdata const char *chosen_lsm_order; -static __initdata const char *chosen_major_lsm; +#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX -static __initconst const char *const builtin_lsm_order = CONFIG_LSM; +/* + * Identifier for the LSM static calls. + * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h + * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT + */ +#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX + +/* + * Call the macro M for each LSM hook MAX_LSM_COUNT times. + */ +#define LSM_LOOP_UNROLL(M, ...) \ +do { \ + UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) \ +} while (0) -/* Ordered list of LSMs to initialize. */ -static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1]; -static __initdata struct lsm_info *exclusive; +#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) #ifdef CONFIG_HAVE_STATIC_CALL #define LSM_HOOK_TRAMP(NAME, NUM) \ @@ -155,496 +146,25 @@ struct lsm_static_calls_table #undef INIT_LSM_STATIC_CALL }; -static __initdata bool debug; -#define init_debug(...) \ - do { \ - if (debug) \ - pr_info(__VA_ARGS__); \ - } while (0) - -static bool __init is_enabled(struct lsm_info *lsm) -{ - if (!lsm->enabled) - return false; - - return *lsm->enabled; -} - -/* Mark an LSM's enabled flag. */ -static int lsm_enabled_true __initdata = 1; -static int lsm_enabled_false __initdata = 0; -static void __init set_enabled(struct lsm_info *lsm, bool enabled) -{ - /* - * When an LSM hasn't configured an enable variable, we can use - * a hard-coded location for storing the default enabled state. 
- */ - if (!lsm->enabled) { - if (enabled) - lsm->enabled = &lsm_enabled_true; - else - lsm->enabled = &lsm_enabled_false; - } else if (lsm->enabled == &lsm_enabled_true) { - if (!enabled) - lsm->enabled = &lsm_enabled_false; - } else if (lsm->enabled == &lsm_enabled_false) { - if (enabled) - lsm->enabled = &lsm_enabled_true; - } else { - *lsm->enabled = enabled; - } -} - -/* Is an LSM already listed in the ordered LSMs list? */ -static bool __init exists_ordered_lsm(struct lsm_info *lsm) -{ - struct lsm_info **check; - - for (check = ordered_lsms; *check; check++) - if (*check == lsm) - return true; - - return false; -} - -/* Append an LSM to the list of ordered LSMs to initialize. */ -static int last_lsm __initdata; -static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) -{ - /* Ignore duplicate selections. */ - if (exists_ordered_lsm(lsm)) - return; - - if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from)) - return; - - /* Enable this LSM, if it is not already set. */ - if (!lsm->enabled) - lsm->enabled = &lsm_enabled_true; - ordered_lsms[last_lsm++] = lsm; - - init_debug("%s ordered: %s (%s)\n", from, lsm->name, - is_enabled(lsm) ? "enabled" : "disabled"); -} - -/* Is an LSM allowed to be initialized? */ -static bool __init lsm_allowed(struct lsm_info *lsm) -{ - /* Skip if the LSM is disabled. */ - if (!is_enabled(lsm)) - return false; - - /* Not allowed if another exclusive LSM already initialized. */ - if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { - init_debug("exclusive disabled: %s\n", lsm->name); - return false; - } - - return true; -} - -static void __init lsm_set_blob_size(int *need, int *lbs) -{ - int offset; - - if (*need <= 0) - return; - - offset = ALIGN(*lbs, sizeof(void *)); - *lbs = offset + *need; - *need = offset; -} - -static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) -{ - if (!needed) - return; - - lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); - lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); - lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib); - /* - * The inode blob gets an rcu_head in addition to - * what the modules might need. - */ - if (needed->lbs_inode && blob_sizes.lbs_inode == 0) - blob_sizes.lbs_inode = sizeof(struct rcu_head); - lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); - lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); - lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key); - lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); - lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event); - lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock); - lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); - lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); - lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev); - lsm_set_blob_size(&needed->lbs_xattr_count, - &blob_sizes.lbs_xattr_count); - lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); - lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); - lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); - lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); -} - -/* Prepare LSM for initialization. */ -static void __init prepare_lsm(struct lsm_info *lsm) -{ - int enabled = lsm_allowed(lsm); - - /* Record enablement (to handle any following exclusive LSMs). */ - set_enabled(lsm, enabled); - - /* If enabled, do pre-initialization work. 
*/ - if (enabled) { - if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { - exclusive = lsm; - init_debug("exclusive chosen: %s\n", lsm->name); - } - - lsm_set_blob_sizes(lsm->blobs); - } -} - -/* Initialize a given LSM, if it is enabled. */ -static void __init initialize_lsm(struct lsm_info *lsm) -{ - if (is_enabled(lsm)) { - int ret; - - init_debug("initializing %s\n", lsm->name); - ret = lsm->init(); - WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); - } -} - -/* - * Current index to use while initializing the lsm id list. - */ -u32 lsm_active_cnt __ro_after_init; -const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; - -/* Populate ordered LSMs list from comma-separated LSM name list. */ -static void __init ordered_lsm_parse(const char *order, const char *origin) -{ - struct lsm_info *lsm; - char *sep, *name, *next; - - /* LSM_ORDER_FIRST is always first. */ - for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (lsm->order == LSM_ORDER_FIRST) - append_ordered_lsm(lsm, " first"); - } - - /* Process "security=", if given. */ - if (chosen_major_lsm) { - struct lsm_info *major; - - /* - * To match the original "security=" behavior, this - * explicitly does NOT fallback to another Legacy Major - * if the selected one was separately disabled: disable - * all non-matching Legacy Major LSMs. - */ - for (major = __start_lsm_info; major < __end_lsm_info; - major++) { - if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && - strcmp(major->name, chosen_major_lsm) != 0) { - set_enabled(major, false); - init_debug("security=%s disabled: %s (only one legacy major LSM)\n", - chosen_major_lsm, major->name); - } - } - } - - sep = kstrdup(order, GFP_KERNEL); - next = sep; - /* Walk the list, looking for matching LSMs. */ - while ((name = strsep(&next, ",")) != NULL) { - bool found = false; - - for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (strcmp(lsm->name, name) == 0) { - if (lsm->order == LSM_ORDER_MUTABLE) - append_ordered_lsm(lsm, origin); - found = true; - } - } - - if (!found) - init_debug("%s ignored: %s (not built into kernel)\n", - origin, name); - } - - /* Process "security=", if given. */ - if (chosen_major_lsm) { - for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (exists_ordered_lsm(lsm)) - continue; - if (strcmp(lsm->name, chosen_major_lsm) == 0) - append_ordered_lsm(lsm, "security="); - } - } - - /* LSM_ORDER_LAST is always last. */ - for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (lsm->order == LSM_ORDER_LAST) - append_ordered_lsm(lsm, " last"); - } - - /* Disable all LSMs not in the ordered list. 
*/ - for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { - if (exists_ordered_lsm(lsm)) - continue; - set_enabled(lsm, false); - init_debug("%s skipped: %s (not in requested order)\n", - origin, lsm->name); - } - - kfree(sep); -} - -static void __init lsm_static_call_init(struct security_hook_list *hl) -{ - struct lsm_static_call *scall = hl->scalls; - int i; - - for (i = 0; i < MAX_LSM_COUNT; i++) { - /* Update the first static call that is not used yet */ - if (!scall->hl) { - __static_call_update(scall->key, scall->trampoline, - hl->hook.lsm_func_addr); - scall->hl = hl; - static_branch_enable(scall->active); - return; - } - scall++; - } - panic("%s - Ran out of static slots.\n", __func__); -} - -static void __init lsm_early_cred(struct cred *cred); -static void __init lsm_early_task(struct task_struct *task); - -static int lsm_append(const char *new, char **result); - -static void __init report_lsm_order(void) -{ - struct lsm_info **lsm, *early; - int first = 0; - - pr_info("initializing lsm="); - - /* Report each enabled LSM name, comma separated. */ - for (early = __start_early_lsm_info; - early < __end_early_lsm_info; early++) - if (is_enabled(early)) - pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); - for (lsm = ordered_lsms; *lsm; lsm++) - if (is_enabled(*lsm)) - pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); - - pr_cont("\n"); -} - -static void __init ordered_lsm_init(void) -{ - struct lsm_info **lsm; - - if (chosen_lsm_order) { - if (chosen_major_lsm) { - pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", - chosen_major_lsm, chosen_lsm_order); - chosen_major_lsm = NULL; - } - ordered_lsm_parse(chosen_lsm_order, "cmdline"); - } else - ordered_lsm_parse(builtin_lsm_order, "builtin"); - - for (lsm = ordered_lsms; *lsm; lsm++) - prepare_lsm(*lsm); - - report_lsm_order(); - - init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); - init_debug("file blob size = %d\n", blob_sizes.lbs_file); - init_debug("ib blob size = %d\n", blob_sizes.lbs_ib); - init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); - init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); -#ifdef CONFIG_KEYS - init_debug("key blob size = %d\n", blob_sizes.lbs_key); -#endif /* CONFIG_KEYS */ - init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); - init_debug("sock blob size = %d\n", blob_sizes.lbs_sock); - init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); - init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event); - init_debug("task blob size = %d\n", blob_sizes.lbs_task); - init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); - init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); - init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); - init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map); - init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog); - init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token); - - /* - * Create any kmem_caches needed for blobs - */ - if (blob_sizes.lbs_file) - lsm_file_cache = kmem_cache_create("lsm_file_cache", - blob_sizes.lbs_file, 0, - SLAB_PANIC, NULL); - if (blob_sizes.lbs_inode) - lsm_inode_cache = kmem_cache_create("lsm_inode_cache", - blob_sizes.lbs_inode, 0, - SLAB_PANIC, NULL); - - lsm_early_cred((struct cred *) current->cred); - lsm_early_task(current); - for (lsm = ordered_lsms; *lsm; lsm++) - initialize_lsm(*lsm); -} - -int __init early_security_init(void) -{ - struct lsm_info *lsm; - - for (lsm = __start_early_lsm_info; lsm 
< __end_early_lsm_info; lsm++) { - if (!lsm->enabled) - lsm->enabled = &lsm_enabled_true; - prepare_lsm(lsm); - initialize_lsm(lsm); - } - - return 0; -} - /** - * security_init - initializes the security framework + * lsm_file_alloc - allocate a composite file blob + * @file: the file that needs a blob * - * This should be called early in the kernel initialization sequence. - */ -int __init security_init(void) -{ - struct lsm_info *lsm; - - init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); - init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); - init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); - - /* - * Append the names of the early LSM modules now that kmalloc() is - * available - */ - for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { - init_debug(" early started: %s (%s)\n", lsm->name, - is_enabled(lsm) ? "enabled" : "disabled"); - if (lsm->enabled) - lsm_append(lsm->name, &lsm_names); - } - - /* Load LSMs in specified order. */ - ordered_lsm_init(); - - return 0; -} - -/* Save user chosen LSM */ -static int __init choose_major_lsm(char *str) -{ - chosen_major_lsm = str; - return 1; -} -__setup("security=", choose_major_lsm); - -/* Explicitly choose LSM initialization order. */ -static int __init choose_lsm_order(char *str) -{ - chosen_lsm_order = str; - return 1; -} -__setup("lsm=", choose_lsm_order); - -/* Enable LSM order debugging. */ -static int __init enable_debug(char *str) -{ - debug = true; - return 1; -} -__setup("lsm.debug", enable_debug); - -static bool match_last_lsm(const char *list, const char *lsm) -{ - const char *last; - - if (WARN_ON(!list || !lsm)) - return false; - last = strrchr(list, ','); - if (last) - /* Pass the comma, strcmp() will check for '\0' */ - last++; - else - last = list; - return !strcmp(last, lsm); -} - -static int lsm_append(const char *new, char **result) -{ - char *cp; - - if (*result == NULL) { - *result = kstrdup(new, GFP_KERNEL); - if (*result == NULL) - return -ENOMEM; - } else { - /* Check if it is the last registered name */ - if (match_last_lsm(*result, new)) - return 0; - cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); - if (cp == NULL) - return -ENOMEM; - kfree(*result); - *result = cp; - } - return 0; -} - -/** - * security_add_hooks - Add a modules hooks to the hook lists. - * @hooks: the hooks to add - * @count: the number of hooks to add - * @lsmid: the identification information for the security module + * Allocate the file blob for all the modules * - * Each LSM has to register its hooks with the infrastructure. + * Returns 0, or -ENOMEM if memory can't be allocated. */ -void __init security_add_hooks(struct security_hook_list *hooks, int count, - const struct lsm_id *lsmid) +static int lsm_file_alloc(struct file *file) { - int i; - - /* - * A security module may call security_add_hooks() more - * than once during initialization, and LSM initialization - * is serialized. Landlock is one such case. - * Look at the previous entry, if there is one, for duplication. - */ - if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { - if (lsm_active_cnt >= MAX_LSM_COUNT) - panic("%s Too many LSMs registered.\n", __func__); - lsm_idlist[lsm_active_cnt++] = lsmid; - } - - for (i = 0; i < count; i++) { - hooks[i].lsmid = lsmid; - lsm_static_call_init(&hooks[i]); + if (!lsm_file_cache) { + file->f_security = NULL; + return 0; } - /* - * Don't try to append during early_security_init(), we'll come back - * and fix this up afterwards. 
- */ - if (slab_is_available()) { - if (lsm_append(lsmid->name, &lsm_names) < 0) - panic("%s - Cannot get early memory.\n", __func__); - } + file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); + if (file->f_security == NULL) + return -ENOMEM; + return 0; } /** @@ -679,46 +199,11 @@ static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp) * * Returns 0, or -ENOMEM if memory can't be allocated. */ -static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) +int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp); } -/** - * lsm_early_cred - during initialization allocate a composite cred blob - * @cred: the cred that needs a blob - * - * Allocate the cred blob for all the modules - */ -static void __init lsm_early_cred(struct cred *cred) -{ - int rc = lsm_cred_alloc(cred, GFP_KERNEL); - - if (rc) - panic("%s: Early cred alloc failed.\n", __func__); -} - -/** - * lsm_file_alloc - allocate a composite file blob - * @file: the file that needs a blob - * - * Allocate the file blob for all the modules - * - * Returns 0, or -ENOMEM if memory can't be allocated. - */ -static int lsm_file_alloc(struct file *file) -{ - if (!lsm_file_cache) { - file->f_security = NULL; - return 0; - } - - file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); - if (file->f_security == NULL) - return -ENOMEM; - return 0; -} - /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob @@ -749,7 +234,7 @@ static int lsm_inode_alloc(struct inode *inode, gfp_t gfp) * * Returns 0, or -ENOMEM if memory can't be allocated. */ -static int lsm_task_alloc(struct task_struct *task) +int lsm_task_alloc(struct task_struct *task) { return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL); } @@ -851,20 +336,6 @@ static int lsm_bpf_token_alloc(struct bpf_token *token) } #endif /* CONFIG_BPF_SYSCALL */ -/** - * lsm_early_task - during initialization allocate a composite task blob - * @task: the task that needs a blob - * - * Allocate the task blob for all the modules - */ -static void __init lsm_early_task(struct task_struct *task) -{ - int rc = lsm_task_alloc(task); - - if (rc) - panic("%s: Early task alloc failed.\n", __func__); -} - /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob -- cgit v1.2.3 From 9f9dc69e06ecbc61e7a50b823b82a78daf130dc0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 12 Feb 2025 14:45:06 -0500 Subject: lsm: replace the name field with a pointer to the lsm_id struct Reduce the duplication between the lsm_id struct and the DEFINE_LSM() definition by linking the lsm_id struct directly into the individual LSM's DEFINE_LSM() instance. Linking the lsm_id into the LSM definition also allows us to simplify the security_add_hooks() function by removing the code which populates the lsm_idlist[] array and moving it into the normal LSM startup code where the LSM list is parsed and the individual LSMs are enabled, making for a cleaner implementation with less overhead at boot. 
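As an illustration only (not part of this patch), a converted module ends up looking roughly like the sketch below; the "example" name, the example_hooks array and the use of LSM_ID_UNDEF are placeholders rather than in-tree code:

#include <linux/lsm_hooks.h>

static const struct lsm_id example_lsmid = {
	.name	= "example",
	.id	= LSM_ID_UNDEF,		/* a real LSM uses its own LSM_ID_* value */
};

static int __init example_lsm_init(void)
{
	/* example_hooks is a hypothetical security_hook_list array */
	security_add_hooks(example_hooks, ARRAY_SIZE(example_hooks),
			   &example_lsmid);
	return 0;
}

DEFINE_LSM(example) = {
	.id	= &example_lsmid,	/* previously: .name = "example" */
	.init	= example_lsm_init,
};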
Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 2 +- security/apparmor/lsm.c | 2 +- security/bpf/hooks.c | 2 +- security/commoncap.c | 2 +- security/integrity/evm/evm_main.c | 2 +- security/integrity/ima/ima_main.c | 2 +- security/ipe/ipe.c | 2 +- security/landlock/setup.c | 2 +- security/loadpin/loadpin.c | 2 +- security/lockdown/lockdown.c | 2 +- security/lsm_init.c | 45 ++++++++++++++++----------------------- security/safesetid/lsm.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- security/tomoyo/tomoyo.c | 2 +- security/yama/yama_lsm.c | 2 +- 16 files changed, 33 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0112926ed923..7343dd60b1d5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -152,7 +152,7 @@ enum lsm_order { }; struct lsm_info { - const char *name; /* Required. */ + const struct lsm_id *id; enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */ unsigned long flags; /* Optional: flags describing LSM */ int *enabled; /* Optional: controlled by CONFIG_LSM */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b3f7a3258a2c..f6798144234b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2555,7 +2555,7 @@ alloc_out: } DEFINE_LSM(apparmor) = { - .name = "apparmor", + .id = &apparmor_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index db759025abe1..40efde233f3a 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -33,7 +33,7 @@ struct lsm_blob_sizes bpf_lsm_blob_sizes __ro_after_init = { }; DEFINE_LSM(bpf) = { - .name = "bpf", + .id = &bpf_lsmid, .init = bpf_lsm_init, .blobs = &bpf_lsm_blob_sizes }; diff --git a/security/commoncap.c b/security/commoncap.c index 6bd4adeb4795..b50479bd0286 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1505,7 +1505,7 @@ static int __init capability_init(void) } DEFINE_LSM(capability) = { - .name = "capability", + .id = &capability_lsmid, .order = LSM_ORDER_FIRST, .init = capability_init, }; diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 0add782e73ba..db8e324ed4e6 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -1175,7 +1175,7 @@ struct lsm_blob_sizes evm_blob_sizes __ro_after_init = { }; DEFINE_LSM(evm) = { - .name = "evm", + .id = &evm_lsmid, .init = init_evm_lsm, .order = LSM_ORDER_LAST, .blobs = &evm_blob_sizes, diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index cdd225f65a62..eade8e1e3cb1 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -1279,7 +1279,7 @@ struct lsm_blob_sizes ima_blob_sizes __ro_after_init = { }; DEFINE_LSM(ima) = { - .name = "ima", + .id = &ima_lsmid, .init = init_ima_lsm, .order = LSM_ORDER_LAST, .blobs = &ima_blob_sizes, diff --git a/security/ipe/ipe.c b/security/ipe/ipe.c index 4317134cb0da..2426441181dc 100644 --- a/security/ipe/ipe.c +++ b/security/ipe/ipe.c @@ -92,7 +92,7 @@ static int __init ipe_init(void) } DEFINE_LSM(ipe) = { - .name = "ipe", + .id = &ipe_lsmid, .init = ipe_init, .blobs = &ipe_blobs, }; diff --git a/security/landlock/setup.c b/security/landlock/setup.c index bd53c7a56ab9..47dac1736f10 100644 --- 
a/security/landlock/setup.c +++ b/security/landlock/setup.c @@ -75,7 +75,7 @@ static int __init landlock_init(void) } DEFINE_LSM(LANDLOCK_NAME) = { - .name = LANDLOCK_NAME, + .id = &landlock_lsmid, .init = landlock_init, .blobs = &landlock_blob_sizes, }; diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 68252452b66c..b9ddf05c5c16 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -271,7 +271,7 @@ static int __init loadpin_init(void) } DEFINE_LSM(loadpin) = { - .name = "loadpin", + .id = &loadpin_lsmid, .init = loadpin_init, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index cf83afa1d879..4813f168ff93 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -168,6 +168,6 @@ DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif - .name = "lockdown", + .id = &lockdown_lsmid, .init = lockdown_lsm_init, }; diff --git a/security/lsm_init.c b/security/lsm_init.c index 9249d5f37ae9..692d61a2ea10 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -127,9 +127,10 @@ static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; - ordered_lsms[last_lsm++] = lsm; + ordered_lsms[last_lsm] = lsm; + lsm_idlist[last_lsm++] = lsm->id; - init_debug("%s ordered: %s (%s)\n", from, lsm->name, + init_debug("%s ordered: %s (%s)\n", from, lsm->id->name, is_enabled(lsm) ? "enabled" : "disabled"); } @@ -157,7 +158,7 @@ static void __init lsm_prepare(struct lsm_info *lsm) set_enabled(lsm, false); return; } else if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { - init_debug("exclusive disabled: %s\n", lsm->name); + init_debug("exclusive disabled: %s\n", lsm->id->name); set_enabled(lsm, false); return; } @@ -165,7 +166,7 @@ static void __init lsm_prepare(struct lsm_info *lsm) /* Mark the LSM as enabled. 
*/ set_enabled(lsm, true); if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { - init_debug("exclusive chosen: %s\n", lsm->name); + init_debug("exclusive chosen: %s\n", lsm->id->name); exclusive = lsm; } @@ -200,9 +201,9 @@ static void __init initialize_lsm(struct lsm_info *lsm) if (is_enabled(lsm)) { int ret; - init_debug("initializing %s\n", lsm->name); + init_debug("initializing %s\n", lsm->id->name); ret = lsm->init(); - WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); + WARN(ret, "%s failed to initialize: %d\n", lsm->id->name, ret); } } @@ -236,10 +237,10 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) */ lsm_for_each_raw(major) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && - strcmp(major->name, chosen_major_lsm) != 0) { + strcmp(major->id->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s (only one legacy major LSM)\n", - chosen_major_lsm, major->name); + chosen_major_lsm, major->id->name); } } } @@ -251,7 +252,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) bool found = false; lsm_for_each_raw(lsm) { - if (strcmp(lsm->name, name) == 0) { + if (strcmp(lsm->id->name, name) == 0) { if (lsm->order == LSM_ORDER_MUTABLE) append_ordered_lsm(lsm, origin); found = true; @@ -268,7 +269,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) lsm_for_each_raw(lsm) { if (exists_ordered_lsm(lsm)) continue; - if (strcmp(lsm->name, chosen_major_lsm) == 0) + if (strcmp(lsm->id->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } @@ -285,7 +286,7 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) continue; set_enabled(lsm, false); init_debug("%s skipped: %s (not in requested order)\n", - origin, lsm->name); + origin, lsm->id->name); } kfree(sep); @@ -317,11 +318,13 @@ static void __init lsm_init_ordered(void) pr_info("initializing lsm="); lsm_early_for_each_raw(early) { if (is_enabled(early)) - pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); + pr_cont("%s%s", + first++ == 0 ? "" : ",", early->id->name); } lsm_order_for_each(lsm) { if (is_enabled(*lsm)) - pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); + pr_cont("%s%s", + first++ == 0 ? "" : ",", (*lsm)->id->name); } pr_cont("\n"); @@ -432,18 +435,6 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, { int i; - /* - * A security module may call security_add_hooks() more - * than once during initialization, and LSM initialization - * is serialized. Landlock is one such case. - * Look at the previous entry, if there is one, for duplication. - */ - if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { - if (lsm_active_cnt >= MAX_LSM_COUNT) - panic("%s Too many LSMs registered.\n", __func__); - lsm_idlist[lsm_active_cnt++] = lsmid; - } - for (i = 0; i < count; i++) { hooks[i].lsmid = lsmid; lsm_static_call_init(&hooks[i]); @@ -491,10 +482,10 @@ int __init security_init(void) * available */ lsm_early_for_each_raw(lsm) { - init_debug(" early started: %s (%s)\n", lsm->name, + init_debug(" early started: %s (%s)\n", lsm->id->name, is_enabled(lsm) ? "enabled" : "disabled"); if (lsm->enabled) - lsm_append(lsm->name, &lsm_names); + lsm_append(lsm->id->name, &lsm_names); } /* Load LSMs in specified order. 
*/ diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index 1ba564f097f5..9a7c68d4e642 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -287,6 +287,6 @@ static int __init safesetid_security_init(void) } DEFINE_LSM(safesetid_security_init) = { + .id = &safesetid_lsmid, .init = safesetid_security_init, - .name = "safesetid", }; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dfc22da42f30..299b656ac007 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7639,7 +7639,7 @@ void selinux_complete_init(void) /* SELinux requires early initialization in order to label all processes and objects when they are created. */ DEFINE_LSM(selinux) = { - .name = "selinux", + .id = &selinux_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &selinux_enabled_boot, .blobs = &selinux_blob_sizes, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index af986587841d..392698e41120 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5280,7 +5280,7 @@ static __init int smack_init(void) * all processes and objects when they are created. */ DEFINE_LSM(smack) = { - .name = "smack", + .id = &smack_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .blobs = &smack_blob_sizes, .init = smack_init, diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 48fc59d38ab2..cb003c460dc2 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -612,7 +612,7 @@ static int __init tomoyo_init(void) } DEFINE_LSM(tomoyo) = { - .name = "tomoyo", + .id = &tomoyo_lsmid, .enabled = &tomoyo_enabled, .flags = LSM_FLAG_LEGACY_MAJOR, .blobs = &tomoyo_blob_sizes, diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 3d064dd4e03f..38b21ee0c560 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -476,6 +476,6 @@ static int __init yama_init(void) } DEFINE_LSM(yama) = { - .name = "yama", + .id = &yama_lsmid, .init = yama_init, }; -- cgit v1.2.3 From 250898ca335f337bc032a9693dc0a30a1cb85825 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 12 Feb 2025 15:36:51 -0500 Subject: lsm: rework lsm_active_cnt and lsm_idlist[] Move the LSM active count and lsm_id list declarations out of a header that is visible across the kernel and into a header that is limited to the LSM framework. This not only helps keep the include/linux headers smaller and cleaner, it helps prevent misuse of these variables. 
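For illustration (not taken from the patch), code inside security/ can still reach the list by including the framework-private header; the following is a rough sketch in the spirit of the lsm_list_modules() syscall handler, not a copy of it:

#include <linux/printk.h>

#include "lsm.h"

static void lsm_report_active(void)
{
	unsigned int i;

	/* lsm_active_cnt and lsm_idlist[] are now private to security/ */
	for (i = 0; i < lsm_active_cnt; i++)
		pr_info("LSM %u: %s (id %llu)\n", i, lsm_idlist[i]->name,
			(unsigned long long)lsm_idlist[i]->id);
}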
Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/security.h | 2 -- security/lsm.h | 5 +++++ security/lsm_init.c | 6 ------ security/lsm_syscalls.c | 2 ++ security/security.c | 3 +++ 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..556890ea2e83 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -167,8 +167,6 @@ struct lsm_prop { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; -extern u32 lsm_active_cnt; -extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, diff --git a/security/lsm.h b/security/lsm.h index 0e1731bad4a7..dbe755c45e57 100644 --- a/security/lsm.h +++ b/security/lsm.h @@ -7,6 +7,11 @@ #define _LSM_H_ #include +#include + +/* List of configured LSMs */ +extern unsigned int lsm_active_cnt; +extern const struct lsm_id *lsm_idlist[]; /* LSM blob configuration */ extern struct lsm_blob_sizes blob_sizes; diff --git a/security/lsm_init.c b/security/lsm_init.c index a0785ca081c7..d40f31e79bd5 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -217,12 +217,6 @@ static void __init initialize_lsm(struct lsm_info *lsm) } } -/* - * Current index to use while initializing the lsm id list. - */ -u32 lsm_active_cnt __ro_after_init; -const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; - /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c index 8440948a690c..5648b1f0ce9c 100644 --- a/security/lsm_syscalls.c +++ b/security/lsm_syscalls.c @@ -17,6 +17,8 @@ #include #include +#include "lsm.h" + /** * lsm_name_to_attr - map an LSM attribute name to its ID * @name: name of the attribute diff --git a/security/security.c b/security/security.c index dc9734f0d45c..b4eec4f00730 100644 --- a/security/security.c +++ b/security/security.c @@ -73,6 +73,9 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; +unsigned int lsm_active_cnt __ro_after_init; +const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; + struct lsm_blob_sizes blob_sizes; struct kmem_cache *lsm_file_cache; -- cgit v1.2.3 From 935d508d4d7ab9d19c603bd7eb2937249551d507 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 13 Feb 2025 17:34:12 -0500 Subject: lsm: get rid of the lsm_names list and do some cleanup The LSM currently has a lot of code to maintain a list of the currently active LSMs in a human readable string, with the only user being the "/sys/kernel/security/lsm" code. Let's drop all of that code and generate the string on first use and then cache it for subsequent use. 
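To make the "+ 1" length accounting in the new lsm_read() below concrete: with "capability", "yama" and "selinux" active, len_tmp becomes (10 + 1) + (4 + 1) + (7 + 1) = 24 bytes, the assembled string "capability,yama,selinux" occupies 23 characters plus its terminating NUL, and the cached len of len_tmp - 1 = 23 is exactly the number of bytes handed back to userspace.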
Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 1 - security/inode.c | 43 +++++++++++++++++++++++++++++++++++++++-- security/lsm_init.c | 49 ----------------------------------------------- 3 files changed, 41 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7343dd60b1d5..65a8227bece7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -172,7 +172,6 @@ struct lsm_info { /* DO NOT tamper with these variables outside of the LSM framework */ -extern char *lsm_names; extern struct lsm_static_calls_table static_calls_table __ro_after_init; /** diff --git a/security/inode.c b/security/inode.c index 43382ef8896e..6620c3e42af2 100644 --- a/security/inode.c +++ b/security/inode.c @@ -22,6 +22,8 @@ #include #include +#include "lsm.h" + static struct vfsmount *mount; static int mount_count; @@ -315,12 +317,49 @@ void securityfs_remove(struct dentry *dentry) EXPORT_SYMBOL_GPL(securityfs_remove); #ifdef CONFIG_SECURITY +#include + static struct dentry *lsm_dentry; + static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - return simple_read_from_buffer(buf, count, ppos, lsm_names, - strlen(lsm_names)); + int i; + static char *str; + static size_t len; + static DEFINE_SPINLOCK(lock); + + /* NOTE: we never free or modify the string once it is set */ + + if (unlikely(!str || !len)) { + char *str_tmp; + size_t len_tmp = 0; + + for (i = 0; i < lsm_active_cnt; i++) + /* the '+ 1' accounts for either a comma or a NUL */ + len_tmp += strlen(lsm_idlist[i]->name) + 1; + + str_tmp = kmalloc(len_tmp, GFP_KERNEL); + if (!str_tmp) + return -ENOMEM; + str_tmp[0] = '\0'; + + for (i = 0; i < lsm_active_cnt; i++) { + if (i > 0) + strcat(str_tmp, ","); + strcat(str_tmp, lsm_idlist[i]->name); + } + + spin_lock(&lock); + if (!str) { + str = str_tmp; + len = len_tmp - 1; + } else + kfree(str_tmp); + spin_unlock(&lock); + } + + return simple_read_from_buffer(buf, count, ppos, str, len); } static const struct file_operations lsm_ops = { diff --git a/security/lsm_init.c b/security/lsm_init.c index d40f31e79bd5..574fff354d3f 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -10,8 +10,6 @@ #include "lsm.h" -char *lsm_names; - /* Pointers to LSM sections defined in include/asm-generic/vmlinux.lds.h */ extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; @@ -371,42 +369,6 @@ static void __init lsm_init_ordered(void) } } -static bool match_last_lsm(const char *list, const char *lsm) -{ - const char *last; - - if (WARN_ON(!list || !lsm)) - return false; - last = strrchr(list, ','); - if (last) - /* Pass the comma, strcmp() will check for '\0' */ - last++; - else - last = list; - return !strcmp(last, lsm); -} - -static int lsm_append(const char *new, char **result) -{ - char *cp; - - if (*result == NULL) { - *result = kstrdup(new, GFP_KERNEL); - if (*result == NULL) - return -ENOMEM; - } else { - /* Check if it is the last registered name */ - if (match_last_lsm(*result, new)) - return 0; - cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); - if (cp == NULL) - return -ENOMEM; - kfree(*result); - *result = cp; - } - return 0; -} - static void __init lsm_static_call_init(struct security_hook_list *hl) { struct lsm_static_call *scall = hl->scalls; @@ -443,15 +405,6 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, hooks[i].lsmid = lsmid; 
lsm_static_call_init(&hooks[i]); } - - /* - * Don't try to append during early_security_init(), we'll come back - * and fix this up afterwards. - */ - if (slab_is_available()) { - if (lsm_append(lsmid->name, &lsm_names) < 0) - panic("%s - Cannot get early memory.\n", __func__); - } } int __init early_security_init(void) @@ -488,8 +441,6 @@ int __init security_init(void) lsm_early_for_each_raw(lsm) { init_debug(" early started: %s (%s)\n", lsm->id->name, is_enabled(lsm) ? "enabled" : "disabled"); - if (lsm->enabled) - lsm_append(lsm->id->name, &lsm_names); } /* Load LSMs in specified order. */ -- cgit v1.2.3 From 291271e691740003021cf5b48fa7cf7e3371eaa7 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 11 Feb 2025 17:49:11 -0500 Subject: lsm: cleanup the LSM blob size code Convert the lsm_blob_size fields to unsigned integers as there is no current need for them to be negative, change "lsm_set_blob_size()" to "lsm_blob_size_update()" to better reflect reality, and perform some other minor cleanups to the associated code. Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 34 ++++++++++++++-------------- security/lsm_init.c | 57 +++++++++++++++++++++++++++-------------------- 2 files changed, 50 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 65a8227bece7..86e457aa8809 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -102,23 +102,23 @@ struct security_hook_list { * Security blob size or offset data. */ struct lsm_blob_sizes { - int lbs_cred; - int lbs_file; - int lbs_ib; - int lbs_inode; - int lbs_sock; - int lbs_superblock; - int lbs_ipc; - int lbs_key; - int lbs_msg_msg; - int lbs_perf_event; - int lbs_task; - int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ - int lbs_tun_dev; - int lbs_bdev; - int lbs_bpf_map; - int lbs_bpf_prog; - int lbs_bpf_token; + unsigned int lbs_cred; + unsigned int lbs_file; + unsigned int lbs_ib; + unsigned int lbs_inode; + unsigned int lbs_sock; + unsigned int lbs_superblock; + unsigned int lbs_ipc; + unsigned int lbs_key; + unsigned int lbs_msg_msg; + unsigned int lbs_perf_event; + unsigned int lbs_task; + unsigned int lbs_xattr_count; /* num xattr slots in new_xattrs array */ + unsigned int lbs_tun_dev; + unsigned int lbs_bdev; + unsigned int lbs_bpf_map; + unsigned int lbs_bpf_prog; + unsigned int lbs_bpf_token; }; /* diff --git a/security/lsm_init.c b/security/lsm_init.c index f0066857bd1a..6b1f8f18a43c 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -169,16 +169,22 @@ out: lsm_is_enabled(lsm) ? "enabled" : "disabled"); } -static void __init lsm_set_blob_size(int *need, int *lbs) +/** + * lsm_blob_size_update - Update the LSM blob size and offset information + * @sz_req: the requested additional blob size + * @sz_cur: the existing blob size + */ +static void __init lsm_blob_size_update(unsigned int *sz_req, + unsigned int *sz_cur) { - int offset; + unsigned int offset; - if (*need <= 0) + if (*sz_req == 0) return; - offset = ALIGN(*lbs, sizeof(void *)); - *lbs = offset + *need; - *need = offset; + offset = ALIGN(*sz_cur, sizeof(void *)); + *sz_cur = offset + *sz_req; + *sz_req = offset; } /** @@ -193,27 +199,30 @@ static void __init lsm_prepare(struct lsm_info *lsm) return; /* Register the LSM blob sizes. 
*/ - lsm_set_blob_size(&blobs->lbs_cred, &blob_sizes.lbs_cred); - lsm_set_blob_size(&blobs->lbs_file, &blob_sizes.lbs_file); - lsm_set_blob_size(&blobs->lbs_ib, &blob_sizes.lbs_ib); + blobs = lsm->blobs; + lsm_blob_size_update(&blobs->lbs_cred, &blob_sizes.lbs_cred); + lsm_blob_size_update(&blobs->lbs_file, &blob_sizes.lbs_file); + lsm_blob_size_update(&blobs->lbs_ib, &blob_sizes.lbs_ib); /* inode blob gets an rcu_head in addition to LSM blobs. */ if (blobs->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); - lsm_set_blob_size(&blobs->lbs_inode, &blob_sizes.lbs_inode); - lsm_set_blob_size(&blobs->lbs_ipc, &blob_sizes.lbs_ipc); - lsm_set_blob_size(&blobs->lbs_key, &blob_sizes.lbs_key); - lsm_set_blob_size(&blobs->lbs_msg_msg, &blob_sizes.lbs_msg_msg); - lsm_set_blob_size(&blobs->lbs_perf_event, &blob_sizes.lbs_perf_event); - lsm_set_blob_size(&blobs->lbs_sock, &blob_sizes.lbs_sock); - lsm_set_blob_size(&blobs->lbs_superblock, &blob_sizes.lbs_superblock); - lsm_set_blob_size(&blobs->lbs_task, &blob_sizes.lbs_task); - lsm_set_blob_size(&blobs->lbs_tun_dev, &blob_sizes.lbs_tun_dev); - lsm_set_blob_size(&blobs->lbs_xattr_count, - &blob_sizes.lbs_xattr_count); - lsm_set_blob_size(&blobs->lbs_bdev, &blob_sizes.lbs_bdev); - lsm_set_blob_size(&blobs->lbs_bpf_map, &blob_sizes.lbs_bpf_map); - lsm_set_blob_size(&blobs->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); - lsm_set_blob_size(&blobs->lbs_bpf_token, &blob_sizes.lbs_bpf_token); + lsm_blob_size_update(&blobs->lbs_inode, &blob_sizes.lbs_inode); + lsm_blob_size_update(&blobs->lbs_ipc, &blob_sizes.lbs_ipc); + lsm_blob_size_update(&blobs->lbs_key, &blob_sizes.lbs_key); + lsm_blob_size_update(&blobs->lbs_msg_msg, &blob_sizes.lbs_msg_msg); + lsm_blob_size_update(&blobs->lbs_perf_event, + &blob_sizes.lbs_perf_event); + lsm_blob_size_update(&blobs->lbs_sock, &blob_sizes.lbs_sock); + lsm_blob_size_update(&blobs->lbs_superblock, + &blob_sizes.lbs_superblock); + lsm_blob_size_update(&blobs->lbs_task, &blob_sizes.lbs_task); + lsm_blob_size_update(&blobs->lbs_tun_dev, &blob_sizes.lbs_tun_dev); + lsm_blob_size_update(&blobs->lbs_xattr_count, + &blob_sizes.lbs_xattr_count); + lsm_blob_size_update(&blobs->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_blob_size_update(&blobs->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_blob_size_update(&blobs->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_blob_size_update(&blobs->lbs_bpf_token, &blob_sizes.lbs_bpf_token); } /* Initialize a given LSM, if it is enabled. */ -- cgit v1.2.3 From cdc028812f727907d1575cf454a5f01ddffa7750 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 11 Feb 2025 12:18:35 -0500 Subject: lsm: introduce an initcall mechanism into the LSM framework Currently the individual LSMs register their own initcalls, and while this should be harmless, it can be wasteful in the case where a LSM is disabled at boot as the initcall will still be executed. This patch introduces support for managing the initcalls in the LSM framework, and future patches will convert the existing LSMs over to this new mechanism. Only initcall types which are used by the current in-tree LSMs are supported, additional initcall types can easily be added in the future if needed. 
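As a hypothetical sketch (not part of this patch) of what a converted LSM looks like once these members exist: instead of registering late_initcall() on its own, the module hands the callback to the framework, which only invokes it for LSMs that actually made it into the boot order; "example", example_lsmid and the function bodies below are placeholders:

static int __init example_lsm_init(void)
{
	/* register hooks, set up blobs, etc. */
	return 0;
}

static int __init example_late_setup(void)
{
	/* work that needs most of the kernel up, e.g. securityfs entries */
	return 0;
}

DEFINE_LSM(example) = {
	.id		= &example_lsmid,
	.init		= example_lsm_init,
	.initcall_late	= example_late_setup,
};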
Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 33 +++++++++++++++--- security/lsm_init.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 86e457aa8809..b92008641242 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -151,13 +151,36 @@ enum lsm_order { LSM_ORDER_LAST = 1, /* This is only for integrity. */ }; +/** + * struct lsm_info - Define an individual LSM for the LSM framework. + * @id: LSM name/ID info + * @order: ordering with respect to other LSMs, optional + * @flags: descriptive flags, optional + * @blobs: LSM blob sharing, optional + * @enabled: controlled by CONFIG_LSM, optional + * @init: LSM specific initialization routine + * @initcall_pure: LSM callback for initcall_pure() setup, optional + * @initcall_early: LSM callback for early_initcall setup, optional + * @initcall_core: LSM callback for core_initcall() setup, optional + * @initcall_subsys: LSM callback for subsys_initcall() setup, optional + * @initcall_fs: LSM callback for fs_initcall setup, optional + * @nitcall_device: LSM callback for device_initcall() setup, optional + * @initcall_late: LSM callback for late_initcall() setup, optional + */ struct lsm_info { const struct lsm_id *id; - enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */ - unsigned long flags; /* Optional: flags describing LSM */ - int *enabled; /* Optional: controlled by CONFIG_LSM */ - int (*init)(void); /* Required. */ - struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */ + enum lsm_order order; + unsigned long flags; + struct lsm_blob_sizes *blobs; + int *enabled; + int (*init)(void); + int (*initcall_pure)(void); + int (*initcall_early)(void); + int (*initcall_core)(void); + int (*initcall_subsys)(void); + int (*initcall_fs)(void); + int (*initcall_device)(void); + int (*initcall_late)(void); }; #define DEFINE_LSM(lsm) \ diff --git a/security/lsm_init.c b/security/lsm_init.c index fd69bde9112e..aacdac406ba5 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -39,6 +39,27 @@ static __initdata struct lsm_info *lsm_exclusive; for ((iter) = __start_early_lsm_info; \ (iter) < __end_early_lsm_info; (iter)++) +#define lsm_initcall(level) \ + ({ \ + int _r, _rc = 0; \ + struct lsm_info **_lp, *_l; \ + lsm_order_for_each(_lp) { \ + _l = *_lp; \ + if (!_l->initcall_##level) \ + continue; \ + lsm_pr_dbg("running %s %s initcall", \ + _l->id->name, #level); \ + _r = _l->initcall_##level(); \ + if (_r) { \ + pr_warn("failed LSM %s %s initcall with errno %d\n", \ + _l->id->name, #level, _r); \ + if (!_rc) \ + _rc = _r; \ + } \ + } \ + _rc; \ + }) + /** * lsm_choose_security - Legacy "major" LSM selection * @str: kernel command line parameter @@ -461,3 +482,71 @@ int __init security_init(void) return 0; } + +/** + * security_initcall_pure - Run the LSM pure initcalls + */ +static int __init security_initcall_pure(void) +{ + return lsm_initcall(pure); +} +pure_initcall(security_initcall_pure); + +/** + * security_initcall_early - Run the LSM early initcalls + */ +static int __init security_initcall_early(void) +{ + return lsm_initcall(early); +} +early_initcall(security_initcall_early); + +/** + * security_initcall_core - Run the LSM core initcalls + */ +static int __init security_initcall_core(void) +{ + return lsm_initcall(core); +} 
+core_initcall(security_initcall_core); + +/** + * security_initcall_subsys - Run the LSM subsys initcalls + */ +static int __init security_initcall_subsys(void) +{ + return lsm_initcall(subsys); +} +subsys_initcall(security_initcall_subsys); + +/** + * security_initcall_fs - Run the LSM fs initcalls + */ +static int __init security_initcall_fs(void) +{ + return lsm_initcall(fs); +} +fs_initcall(security_initcall_fs); + +/** + * security_initcall_device - Run the LSM device initcalls + */ +static int __init security_initcall_device(void) +{ + return lsm_initcall(device); +} +device_initcall(security_initcall_device); + +/** + * security_initcall_late - Run the LSM late initcalls + */ +static int __init security_initcall_late(void) +{ + int rc; + + rc = lsm_initcall(late); + lsm_pr_dbg("all enabled LSMs fully activated\n"); + + return rc; +} +late_initcall(security_initcall_late); -- cgit v1.2.3 From dfa024bc3f67a97e1a975dd66b83af8b3845eb19 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 21 Feb 2025 11:53:29 -0500 Subject: lsm: add a LSM_STARTED_ALL notification event Add a new LSM notifier event, LSM_STARTED_ALL, which is fired once at boot when all of the LSMs have been started. Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/security.h | 1 + security/lsm_init.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 556890ea2e83..eb36451ce41f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -85,6 +85,7 @@ struct timezone; enum lsm_event { LSM_POLICY_CHANGE, + LSM_STARTED_ALL, }; struct dm_verity_digest { diff --git a/security/lsm_init.c b/security/lsm_init.c index 0f668bca98f9..6bb67d41ce52 100644 --- a/security/lsm_init.c +++ b/security/lsm_init.c @@ -556,6 +556,7 @@ static int __init security_initcall_late(void) rc = lsm_initcall(late); lsm_pr_dbg("all enabled LSMs fully activated\n"); + call_blocking_lsm_notifier(LSM_STARTED_ALL, NULL); return rc; } -- cgit v1.2.3 From 094e94d13b606b820e3d1383e3a361f680ff023a Mon Sep 17 00:00:00 2001 From: Thiébaud Weksteen Date: Thu, 18 Sep 2025 12:04:34 +1000 Subject: memfd,selinux: call security_inode_init_security_anon() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to this change, no security hooks were called at the creation of a memfd file. It means that, for SELinux as an example, it will receive the default type of the filesystem that backs the in-memory inode. In most cases, that would be tmpfs, but if MFD_HUGETLB is passed, it will be hugetlbfs. Both can be considered implementation details of memfd. It also means that it is not possible to differentiate between a file coming from memfd_create and a file coming from a standard tmpfs mount point. Additionally, no permission is validated at creation, which differs from the similar memfd_secret syscall. Call security_inode_init_security_anon during creation. This ensures that the file is setup similarly to other anonymous inodes. On SELinux, it means that the file will receive the security context of its task. The ability to limit fexecve on memfd has been of interest to avoid potential pitfalls where /proc/self/exe or similar would be executed [1][2]. Reuse the "execute_no_trans" and "entrypoint" access vectors, similarly to the file class. These access vectors may not make sense for the existing "anon_inode" class. 
Therefore, define and assign a new class "memfd_file" to support such access vectors. Guard these changes behind a new policy capability named "memfd_class". [1] https://crbug.com/1305267 [2] https://lore.kernel.org/lkml/20221215001205.51969-1-jeffxu@google.com/ Signed-off-by: Thiébaud Weksteen Reviewed-by: Stephen Smalley Tested-by: Stephen Smalley Acked-by: Hugh Dickins [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/memfd.h | 2 ++ mm/memfd.c | 14 ++++++++++++-- security/selinux/hooks.c | 26 +++++++++++++++++++++----- security/selinux/include/classmap.h | 2 ++ security/selinux/include/policycap.h | 1 + security/selinux/include/policycap_names.h | 1 + security/selinux/include/security.h | 5 +++++ 7 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 6f606d9573c3..cc74de3dbcfe 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -4,6 +4,8 @@ #include +#define MEMFD_ANON_NAME "[memfd]" + #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); diff --git a/mm/memfd.c b/mm/memfd.c index 1d109c1acf21..a61acbe5ded3 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -433,6 +433,8 @@ static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; + struct inode *inode; + int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, @@ -444,12 +446,20 @@ static struct file *alloc_file(const char *name, unsigned int flags) } if (IS_ERR(file)) return file; + + inode = file_inode(file); + err = security_inode_init_security_anon(inode, + &QSTR(MEMFD_ANON_NAME), NULL); + if (err) { + fput(file); + file = ERR_PTR(err); + return file; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { - struct inode *inode = file_inode(file); - inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dfc22da42f30..a22b1920242f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -93,6 +93,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -2319,6 +2320,10 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) new_tsec = selinux_cred(bprm->cred); isec = inode_security(inode); + if (WARN_ON(isec->sclass != SECCLASS_FILE && + isec->sclass != SECCLASS_MEMFD_FILE)) + return -EACCES; + /* Default to the current task SID. 
*/ new_tsec->sid = old_tsec->sid; new_tsec->osid = old_tsec->sid; @@ -2371,8 +2376,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { - rc = avc_has_perm(old_tsec->sid, isec->sid, - SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); + rc = avc_has_perm(old_tsec->sid, isec->sid, isec->sclass, + FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { @@ -2382,8 +2387,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) if (rc) return rc; - rc = avc_has_perm(new_tsec->sid, isec->sid, - SECCLASS_FILE, FILE__ENTRYPOINT, &ad); + rc = avc_has_perm(new_tsec->sid, isec->sid, isec->sclass, + FILE__ENTRYPOINT, &ad); if (rc) return rc; @@ -2978,10 +2983,18 @@ static int selinux_inode_init_security_anon(struct inode *inode, struct common_audit_data ad; struct inode_security_struct *isec; int rc; + bool is_memfd = false; if (unlikely(!selinux_initialized())) return 0; + if (name != NULL && name->name != NULL && + !strcmp(name->name, MEMFD_ANON_NAME)) { + if (!selinux_policycap_memfd_class()) + return 0; + is_memfd = true; + } + isec = selinux_inode(inode); /* @@ -3001,7 +3014,10 @@ static int selinux_inode_init_security_anon(struct inode *inode, isec->sclass = context_isec->sclass; isec->sid = context_isec->sid; } else { - isec->sclass = SECCLASS_ANON_INODE; + if (is_memfd) + isec->sclass = SECCLASS_MEMFD_FILE; + else + isec->sclass = SECCLASS_ANON_INODE; rc = security_transition_sid( sid, sid, isec->sclass, name, &isec->sid); diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5665aa5e7853..3ec85142771f 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -179,6 +179,8 @@ const struct security_class_mapping secclass_map[] = { { "anon_inode", { COMMON_FILE_PERMS, NULL } }, { "io_uring", { "override_creds", "sqpoll", "cmd", "allowed", NULL } }, { "user_namespace", { "create", NULL } }, + { "memfd_file", + { COMMON_FILE_PERMS, "execute_no_trans", "entrypoint", NULL } }, /* last one */ { NULL, {} } }; diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index 135a969f873c..231d02227e59 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -18,6 +18,7 @@ enum { POLICYDB_CAP_NETIF_WILDCARD, POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, POLICYDB_CAP_FUNCTIONFS_SECLABEL, + POLICYDB_CAP_MEMFD_CLASS, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index ff8882887651..454dab37bda3 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -21,6 +21,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "netif_wildcard", "genfs_seclabel_wildcard", "functionfs_seclabel", + "memfd_class", }; /* clang-format on */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 0f954a40d3fc..5d1dad8058b1 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -209,6 +209,11 @@ static inline bool selinux_policycap_functionfs_seclabel(void) selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]); } +static inline bool selinux_policycap_memfd_class(void) +{ + return READ_ONCE(selinux_state.policycap[POLICYDB_CAP_MEMFD_CLASS]); +} + struct selinux_policy_convert_data; struct selinux_load_state { -- cgit v1.2.3 From 
26ab9830beabda863766be4a79dc590c7645f4d9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 21 Oct 2025 08:26:49 +0100 Subject: net: stmmac: replace has_xxxx with core_type Replace the has_gmac, has_gmac4 and has_xgmac ints, of which only one can be set when matching a core to its driver backend, with an enumerated type carrying the DWMAC core type. Tested-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Acked-by: Chen-Yu Tsai Reviewed-by: Maxime Chevallier Tested-by: Mohd Ayaan Anwar Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/E1vB6ld-0000000BIPy-2Qi4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 5 ++ .../ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 5 +- .../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c | 2 +- .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 4 +- drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 2 +- drivers/net/ethernet/stmicro/stmmac/hwif.c | 73 ++++++++-------------- drivers/net/ethernet/stmicro/stmmac/stmmac_est.c | 4 +- .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 13 ++-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 34 +++++----- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 14 ++--- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 4 +- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 9 ++- drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 4 +- include/linux/stmmac.h | 11 +++- 21 files changed, 94 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index fee7021246b1..31254ba525d5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -43,6 +43,11 @@ #define DWXGMAC_ID 0x76 #define DWXLGMAC_ID 0x27 +static inline bool dwmac_is_xmac(enum dwmac_core_type core_type) +{ + return core_type == DWMAC_CORE_GMAC4 || core_type == DWMAC_CORE_XGMAC; +} + #define STMMAC_CHAN0 0 /* Always supported and default for all chips */ /* TX and RX Descriptor Length, these need to be power of two. 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index f1c2e35badf7..c7cd6497d42d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -109,7 +109,7 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, } /* dwc-qos needs GMAC4, AAL, TSO and PMT */ - plat_dat->has_gmac4 = 1; + plat_dat->core_type = DWMAC_CORE_GMAC4; plat_dat->dma_cfg->aal = 1; plat_dat->flags |= STMMAC_FLAG_TSO_EN; plat_dat->pmt = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index e74d00984b88..b2194e414ec1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -565,7 +565,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) { /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; - plat->has_gmac = 1; + plat->core_type = DWMAC_CORE_GMAC; plat->force_sf_dma_mode = 1; plat->mdio_bus_data->needs_reset = true; @@ -612,8 +612,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->pdev = pdev; plat->phy_addr = -1; plat->clk_csr = STMMAC_CSR_250_300M; - plat->has_gmac = 0; - plat->has_gmac4 = 1; + plat->core_type = DWMAC_CORE_GMAC4; plat->force_sf_dma_mode = 0; plat->flags |= (STMMAC_FLAG_TSO_EN | STMMAC_FLAG_SPH_DISABLE); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index ca4035cbb55b..c05f85534f0c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -473,7 +473,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) return err; } - plat_dat->has_gmac = true; + plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->bsp_priv = gmac; plat_dat->set_clk_tx_rate = ipq806x_gmac_set_clk_tx_rate; plat_dat->multicast_filter_bins = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 592aa9d636e5..2a3ac0136cdb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -92,7 +92,7 @@ static void loongson_default_data(struct pci_dev *pdev, /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; - plat->has_gmac = 1; + plat->core_type = DWMAC_CORE_GMAC; plat->force_sf_dma_mode = 1; /* Set default value for multicast hash bins */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c index 2562a6d036a2..6fffc9dfbae5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c @@ -41,7 +41,7 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev) if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); - plat_dat->has_gmac = true; + plat_dat->core_type = DWMAC_CORE_GMAC; reg = syscon_regmap_lookup_by_compatible("nxp,lpc1850-creg"); if (IS_ERR(reg)) { diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 32244217d952..d1e48b524d7a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -846,7 +846,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->fix_mac_speed = 
ethqos_fix_mac_speed; plat_dat->dump_debug_regs = rgmii_dump; plat_dat->ptp_clk_freq_config = ethqos_ptp_clk_freq_config; - plat_dat->has_gmac4 = 1; + plat_dat->core_type = DWMAC_CORE_GMAC4; if (ethqos->has_emac_ge_3) plat_dat->dwmac4_addrs = &data->dwmac4_addrs; plat_dat->pmt = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 51ea0caf16c1..9b92f4d335cc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -1750,8 +1750,8 @@ static int rk_gmac_probe(struct platform_device *pdev) /* If the stmmac is not already selected as gmac4, * then make sure we fallback to gmac. */ - if (!plat_dat->has_gmac4) { - plat_dat->has_gmac = true; + if (plat_dat->core_type != DWMAC_CORE_GMAC4) { + plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->rx_fifo_size = 4096; plat_dat->tx_fifo_size = 2048; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c index 221539d760bc..ee095ac13203 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -146,7 +146,7 @@ static int s32_dwmac_probe(struct platform_device *pdev) gmac->ioaddr = res.addr; /* S32CC core feature set */ - plat->has_gmac4 = true; + plat->core_type = DWMAC_CORE_GMAC4; plat->pmt = 1; plat->flags |= STMMAC_FLAG_SPH_DISABLE; plat->rx_fifo_size = 20480; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 354f01184e6c..2ff5db6d41ca 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -497,7 +497,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) plat_dat->pcs_init = socfpga_dwmac_pcs_init; plat_dat->pcs_exit = socfpga_dwmac_pcs_exit; plat_dat->select_pcs = socfpga_dwmac_select_pcs; - plat_dat->has_gmac = true; + plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->riwt_off = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 1eadcf5d1ad6..7f560d78209d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -136,7 +136,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev) /* platform data specifying hardware features and callbacks. * hardware features were copied from Allwinner drivers. 
*/ plat_dat->tx_coe = 1; - plat_dat->has_gmac = true; + plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->bsp_priv = gmac; plat_dat->init = sun7i_gmac_init; plat_dat->exit = sun7i_gmac_exit; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index dc903b846b1b..d765acbe3754 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -308,7 +308,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) goto disable_clks; } - plat->has_xgmac = 1; + plat->core_type = DWMAC_CORE_XGMAC; plat->flags |= STMMAC_FLAG_TSO_EN; plat->pmt = 1; plat->bsp_priv = mgbe; diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 3f7c765dcb79..00083ce52549 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -106,9 +106,7 @@ int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr) } static const struct stmmac_hwif_entry { - bool gmac; - bool gmac4; - bool xgmac; + enum dwmac_core_type core_type; u32 min_id; u32 dev_id; const struct stmmac_regs_off regs; @@ -127,9 +125,7 @@ static const struct stmmac_hwif_entry { } stmmac_hw[] = { /* NOTE: New HW versions shall go to the end of this table */ { - .gmac = false, - .gmac4 = false, - .xgmac = false, + .core_type = DWMAC_CORE_MAC100, .min_id = 0, .regs = { .ptp_off = PTP_GMAC3_X_OFFSET, @@ -146,9 +142,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac100_setup, .quirks = stmmac_dwmac1_quirks, }, { - .gmac = true, - .gmac4 = false, - .xgmac = false, + .core_type = DWMAC_CORE_GMAC, .min_id = 0, .regs = { .ptp_off = PTP_GMAC3_X_OFFSET, @@ -165,9 +159,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac1000_setup, .quirks = stmmac_dwmac1_quirks, }, { - .gmac = false, - .gmac4 = true, - .xgmac = false, + .core_type = DWMAC_CORE_GMAC4, .min_id = 0, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -187,9 +179,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac4_setup, .quirks = stmmac_dwmac4_quirks, }, { - .gmac = false, - .gmac4 = true, - .xgmac = false, + .core_type = DWMAC_CORE_GMAC4, .min_id = DWMAC_CORE_4_00, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -210,9 +200,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac4_setup, .quirks = NULL, }, { - .gmac = false, - .gmac4 = true, - .xgmac = false, + .core_type = DWMAC_CORE_GMAC4, .min_id = DWMAC_CORE_4_10, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -233,9 +221,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac4_setup, .quirks = NULL, }, { - .gmac = false, - .gmac4 = true, - .xgmac = false, + .core_type = DWMAC_CORE_GMAC4, .min_id = DWMAC_CORE_5_10, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -256,9 +242,7 @@ static const struct stmmac_hwif_entry { .setup = dwmac4_setup, .quirks = NULL, }, { - .gmac = false, - .gmac4 = false, - .xgmac = true, + .core_type = DWMAC_CORE_XGMAC, .min_id = DWXGMAC_CORE_2_10, .dev_id = DWXGMAC_ID, .regs = { @@ -280,9 +264,7 @@ static const struct stmmac_hwif_entry { .setup = dwxgmac2_setup, .quirks = NULL, }, { - .gmac = false, - .gmac4 = false, - .xgmac = true, + .core_type = DWMAC_CORE_XGMAC, .min_id = DWXLGMAC_CORE_2_00, .dev_id = DWXLGMAC_ID, .regs = { @@ -308,20 +290,18 @@ static const struct stmmac_hwif_entry { int stmmac_hwif_init(struct stmmac_priv *priv) { - bool needs_xgmac = priv->plat->has_xgmac; - bool needs_gmac4 = priv->plat->has_gmac4; - bool needs_gmac = priv->plat->has_gmac; + enum dwmac_core_type core_type = 
priv->plat->core_type; const struct stmmac_hwif_entry *entry; struct mac_device_info *mac; bool needs_setup = true; u32 id, dev_id = 0; int i, ret; - if (needs_gmac) { + if (core_type == DWMAC_CORE_GMAC) { id = stmmac_get_id(priv, GMAC_VERSION); - } else if (needs_gmac4 || needs_xgmac) { + } else if (dwmac_is_xmac(core_type)) { id = stmmac_get_id(priv, GMAC4_VERSION); - if (needs_xgmac) + if (core_type == DWMAC_CORE_XGMAC) dev_id = stmmac_get_dev_id(priv, GMAC4_VERSION); } else { id = 0; @@ -331,14 +311,16 @@ int stmmac_hwif_init(struct stmmac_priv *priv) priv->synopsys_id = id; /* Lets assume some safe values first */ - priv->ptpaddr = priv->ioaddr + - (needs_gmac4 ? PTP_GMAC4_OFFSET : PTP_GMAC3_X_OFFSET); - priv->mmcaddr = priv->ioaddr + - (needs_gmac4 ? MMC_GMAC4_OFFSET : MMC_GMAC3_X_OFFSET); - if (needs_gmac4) + if (core_type == DWMAC_CORE_GMAC4) { + priv->ptpaddr = priv->ioaddr + PTP_GMAC4_OFFSET; + priv->mmcaddr = priv->ioaddr + MMC_GMAC4_OFFSET; priv->estaddr = priv->ioaddr + EST_GMAC4_OFFSET; - else if (needs_xgmac) - priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET; + } else { + priv->ptpaddr = priv->ioaddr + PTP_GMAC3_X_OFFSET; + priv->mmcaddr = priv->ioaddr + MMC_GMAC3_X_OFFSET; + if (core_type == DWMAC_CORE_XGMAC) + priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET; + } /* Check for HW specific setup first */ if (priv->plat->setup) { @@ -355,16 +337,12 @@ int stmmac_hwif_init(struct stmmac_priv *priv) for (i = ARRAY_SIZE(stmmac_hw) - 1; i >= 0; i--) { entry = &stmmac_hw[i]; - if (needs_gmac ^ entry->gmac) - continue; - if (needs_gmac4 ^ entry->gmac4) - continue; - if (needs_xgmac ^ entry->xgmac) + if (core_type != entry->core_type) continue; /* Use synopsys_id var because some setups can override this */ if (priv->synopsys_id < entry->min_id) continue; - if (needs_xgmac && (dev_id ^ entry->dev_id)) + if (core_type == DWMAC_CORE_XGMAC && (dev_id ^ entry->dev_id)) continue; /* Only use generic HW helpers if needed */ @@ -400,6 +378,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv) } dev_err(priv->device, "Failed to find HW IF (id=0x%x, gmac=%d/%d)\n", - id, needs_gmac, needs_gmac4); + id, core_type == DWMAC_CORE_GMAC, + core_type == DWMAC_CORE_GMAC4); return -EINVAL; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c index 4b513d27a988..afc516059b89 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.c @@ -53,7 +53,7 @@ static int est_configure(struct stmmac_priv *priv, struct stmmac_est *cfg, } ctrl = readl(est_addr + EST_CONTROL); - if (priv->plat->has_xgmac) { + if (priv->plat->core_type == DWMAC_CORE_XGMAC) { ctrl &= ~EST_XGMAC_PTOV; ctrl |= ((NSEC_PER_SEC / ptp_rate) * EST_XGMAC_PTOV_MUL) << EST_XGMAC_PTOV_SHIFT; @@ -148,7 +148,7 @@ static void est_irq_status(struct stmmac_priv *priv, struct net_device *dev, } if (status & EST_BTRE) { - if (priv->plat->has_xgmac) { + if (priv->plat->core_type == DWMAC_CORE_XGMAC) { btrl = FIELD_GET(EST_XGMAC_BTRL, status); btrl_max = FIELD_MAX(EST_XGMAC_BTRL); } else { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index c60cd948311e..df016c4eb710 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -303,9 +303,10 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->plat->has_gmac || 
priv->plat->has_gmac4) + if (priv->plat->core_type == DWMAC_CORE_GMAC || + priv->plat->core_type == DWMAC_CORE_GMAC4) strscpy(info->driver, GMAC_ETHTOOL_NAME, sizeof(info->driver)); - else if (priv->plat->has_xgmac) + else if (priv->plat->core_type == DWMAC_CORE_XGMAC) strscpy(info->driver, XGMAC_ETHTOOL_NAME, sizeof(info->driver)); else strscpy(info->driver, MAC100_ETHTOOL_NAME, @@ -351,9 +352,9 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->plat->has_xgmac) + if (priv->plat->core_type == DWMAC_CORE_XGMAC) return XGMAC_REGSIZE * 4; - else if (priv->plat->has_gmac4) + else if (priv->plat->core_type == DWMAC_CORE_GMAC4) return GMAC4_REG_SPACE_SIZE; return REG_SPACE_SIZE; } @@ -368,12 +369,12 @@ static void stmmac_ethtool_gregs(struct net_device *dev, stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space); /* Copy DMA registers to where ethtool expects them */ - if (priv->plat->has_gmac4) { + if (priv->plat->core_type == DWMAC_CORE_GMAC4) { /* GMAC4 dumps its DMA registers at its DMA_CHAN_BASE_ADDR */ memcpy(®_space[ETHTOOL_DMA_OFFSET], ®_space[GMAC4_DMA_CHAN_BASE_ADDR / 4], NUM_DWMAC4_DMA_REGS * 4); - } else if (!priv->plat->has_xgmac) { + } else if (priv->plat->core_type != DWMAC_CORE_XGMAC) { memcpy(®_space[ETHTOOL_DMA_OFFSET], ®_space[DMA_BUS_MODE / 4], NUM_DWMAC1000_DMA_REGS * 4); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5e6aaead5894..9fa3c221a0c3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -446,7 +446,7 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p, if (!priv->hwts_rx_en) return; /* For GMAC4, the valid timestamp is from CTX next desc. 
*/ - if (priv->plat->has_gmac4 || priv->plat->has_xgmac) + if (dwmac_is_xmac(priv->plat->core_type)) desc = np; /* Check if timestamp is available */ @@ -697,7 +697,7 @@ static int stmmac_hwtstamp_get(struct net_device *dev, static int stmmac_init_tstamp_counter(struct stmmac_priv *priv, u32 systime_flags) { - bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + bool xmac = dwmac_is_xmac(priv->plat->core_type); struct timespec64 now; u32 sec_inc = 0; u64 temp = 0; @@ -746,7 +746,7 @@ static int stmmac_init_tstamp_counter(struct stmmac_priv *priv, */ static int stmmac_init_timestamping(struct stmmac_priv *priv) { - bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + bool xmac = dwmac_is_xmac(priv->plat->core_type); int ret; if (priv->plat->ptp_clk_freq_config) @@ -2413,7 +2413,7 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) txfifosz = priv->dma_cap.tx_fifo_size; /* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */ - if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { + if (dwmac_is_xmac(priv->plat->core_type)) { rxfifosz /= rx_channels_count; txfifosz /= tx_channels_count; } @@ -4520,7 +4520,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_is_gso(skb) && priv->tso) { if (gso & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) return stmmac_tso_xmit(skb, dev); - if (priv->plat->has_gmac4 && (gso & SKB_GSO_UDP_L4)) + if (priv->plat->core_type == DWMAC_CORE_GMAC4 && + (gso & SKB_GSO_UDP_L4)) return stmmac_tso_xmit(skb, dev); } @@ -5973,7 +5974,7 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) u32 queue; bool xmac; - xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + xmac = dwmac_is_xmac(priv->plat->core_type); queues_count = (rx_cnt > tx_cnt) ? rx_cnt : tx_cnt; if (priv->irq_wake) @@ -5987,7 +5988,7 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) stmmac_fpe_irq_status(priv); /* To handle GMAC own interrupts */ - if ((priv->plat->has_gmac) || xmac) { + if (priv->plat->core_type == DWMAC_CORE_GMAC || xmac) { int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats); if (unlikely(status)) { @@ -6348,7 +6349,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) (priv->dma_cap.mbps_1000) ? "Y" : "N"); seq_printf(seq, "\tHalf duplex: %s\n", (priv->dma_cap.half_duplex) ? "Y" : "N"); - if (priv->plat->has_xgmac) { + if (priv->plat->core_type == DWMAC_CORE_XGMAC) { seq_printf(seq, "\tNumber of Additional MAC address registers: %d\n", priv->dma_cap.multi_addr); @@ -6372,7 +6373,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) (priv->dma_cap.time_stamp) ? "Y" : "N"); seq_printf(seq, "\tIEEE 1588-2008 Advanced Time Stamp: %s\n", (priv->dma_cap.atime_stamp) ? "Y" : "N"); - if (priv->plat->has_xgmac) + if (priv->plat->core_type == DWMAC_CORE_XGMAC) seq_printf(seq, "\tTimestamp System Time Source: %s\n", dwxgmac_timestamp_source[priv->dma_cap.tssrc]); seq_printf(seq, "\t802.3az - Energy-Efficient Ethernet (EEE): %s\n", @@ -6381,7 +6382,7 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) seq_printf(seq, "\tChecksum Offload in TX: %s\n", (priv->dma_cap.tx_coe) ? "Y" : "N"); if (priv->synopsys_id >= DWMAC_CORE_4_00 || - priv->plat->has_xgmac) { + priv->plat->core_type == DWMAC_CORE_XGMAC) { seq_printf(seq, "\tIP Checksum Offload in RX: %s\n", (priv->dma_cap.rx_coe) ? 
"Y" : "N"); } else { @@ -7233,8 +7234,9 @@ static int stmmac_hw_init(struct stmmac_priv *priv) * has to be disable and this can be done by passing the * riwt_off field from the platform. */ - if (((priv->synopsys_id >= DWMAC_CORE_3_50) || - (priv->plat->has_xgmac)) && (!priv->plat->riwt_off)) { + if ((priv->synopsys_id >= DWMAC_CORE_3_50 || + priv->plat->core_type == DWMAC_CORE_XGMAC) && + !priv->plat->riwt_off) { priv->use_riwt = 1; dev_info(priv->device, "Enable RX Mitigation via HW Watchdog Timer\n"); @@ -7355,7 +7357,7 @@ static int stmmac_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp) return -ENODATA; /* For GMAC4, the valid timestamp is from CTX next desc. */ - if (priv->plat->has_gmac4 || priv->plat->has_xgmac) + if (dwmac_is_xmac(priv->plat->core_type)) desc_contains_ts = ndesc; /* Check if timestamp is available */ @@ -7511,7 +7513,7 @@ int stmmac_dvr_probe(struct device *device, if ((priv->plat->flags & STMMAC_FLAG_TSO_EN) && (priv->dma_cap.tsoen)) { ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; - if (priv->plat->has_gmac4) + if (priv->plat->core_type == DWMAC_CORE_GMAC4) ndev->hw_features |= NETIF_F_GSO_UDP_L4; priv->tso = true; dev_info(priv->device, "TSO feature enabled\n"); @@ -7564,7 +7566,7 @@ int stmmac_dvr_probe(struct device *device, #ifdef STMMAC_VLAN_TAG_USED /* Both mac100 and gmac support receive VLAN tag detection */ ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX; - if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { + if (dwmac_is_xmac(priv->plat->core_type)) { ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; priv->hw->hw_vlan_en = true; } @@ -7595,7 +7597,7 @@ int stmmac_dvr_probe(struct device *device, /* MTU range: 46 - hw-specific max */ ndev->min_mtu = ETH_ZLEN - ETH_HLEN; - if (priv->plat->has_xgmac) + if (priv->plat->core_type == DWMAC_CORE_XGMAC) ndev->max_mtu = XGMAC_JUMBO_LEN; else if ((priv->plat->enh_desc) || (priv->synopsys_id >= DWMAC_CORE_4_00)) ndev->max_mtu = JUMBO_LEN; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index 65db43e9c85e..3f8cc3293964 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -301,7 +301,7 @@ static int stmmac_mdio_read_c22(struct mii_bus *bus, int phyaddr, int phyreg) struct stmmac_priv *priv = netdev_priv(bus->priv); u32 cmd; - if (priv->plat->has_gmac4) + if (priv->plat->core_type == DWMAC_CORE_GMAC4) cmd = MII_GMAC4_READ; else cmd = 0; @@ -344,7 +344,7 @@ static int stmmac_mdio_write_c22(struct mii_bus *bus, int phyaddr, int phyreg, struct stmmac_priv *priv = netdev_priv(bus->priv); u32 cmd; - if (priv->plat->has_gmac4) + if (priv->plat->core_type == DWMAC_CORE_GMAC4) cmd = MII_GMAC4_WRITE; else cmd = MII_ADDR_GWRITE; @@ -417,7 +417,7 @@ int stmmac_mdio_reset(struct mii_bus *bus) * on MDC, so perform a dummy mdio read. To be updated for GMAC4 * if needed. 
*/ - if (!priv->plat->has_gmac4) + if (priv->plat->core_type != DWMAC_CORE_GMAC4) writel(0, priv->ioaddr + mii_address); #endif return 0; @@ -528,7 +528,7 @@ static u32 stmmac_clk_csr_set(struct stmmac_priv *priv) value = 0; } - if (priv->plat->has_xgmac) { + if (priv->plat->core_type == DWMAC_CORE_XGMAC) { if (clk_rate > 400000000) value = 0x5; else if (clk_rate > 350000000) @@ -601,7 +601,7 @@ int stmmac_mdio_register(struct net_device *ndev) new_bus->name = "stmmac"; - if (priv->plat->has_xgmac) { + if (priv->plat->core_type == DWMAC_CORE_XGMAC) { new_bus->read = &stmmac_xgmac2_mdio_read_c22; new_bus->write = &stmmac_xgmac2_mdio_write_c22; new_bus->read_c45 = &stmmac_xgmac2_mdio_read_c45; @@ -622,7 +622,7 @@ int stmmac_mdio_register(struct net_device *ndev) } else { new_bus->read = &stmmac_mdio_read_c22; new_bus->write = &stmmac_mdio_write_c22; - if (priv->plat->has_gmac4) { + if (priv->plat->core_type == DWMAC_CORE_GMAC4) { new_bus->read_c45 = &stmmac_mdio_read_c45; new_bus->write_c45 = &stmmac_mdio_write_c45; } @@ -650,7 +650,7 @@ int stmmac_mdio_register(struct net_device *ndev) } /* Looks like we need a dummy read for XGMAC only and C45 PHYs */ - if (priv->plat->has_xgmac) + if (priv->plat->core_type == DWMAC_CORE_XGMAC) stmmac_xgmac2_mdio_read_c45(new_bus, 0, 0, 0); /* If fixed-link is set, skip PHY scanning */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 4e3aa611fda8..94b3a3b27270 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -23,7 +23,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) { /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; - plat->has_gmac = 1; + plat->core_type = DWMAC_CORE_GMAC; plat->force_sf_dma_mode = 1; plat->mdio_bus_data->needs_reset = true; @@ -76,7 +76,7 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, int i; plat->clk_csr = STMMAC_CSR_250_300M; - plat->has_gmac4 = 1; + plat->core_type = DWMAC_CORE_GMAC4; plat->force_sf_dma_mode = 1; plat->flags |= STMMAC_FLAG_TSO_EN; plat->pmt = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 27bcaae07a7f..fbb92cc6ab59 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -552,12 +552,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) &pdev->dev, plat->unicast_filter_entries); plat->multicast_filter_bins = dwmac1000_validate_mcast_bins( &pdev->dev, plat->multicast_filter_bins); - plat->has_gmac = 1; + plat->core_type = DWMAC_CORE_GMAC; plat->pmt = 1; } if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { - plat->has_gmac = 1; + plat->core_type = DWMAC_CORE_GMAC; plat->enh_desc = 1; plat->tx_coe = 1; plat->bugged_jumbo = 1; @@ -565,8 +565,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) } if (of_device_compatible_match(np, stmmac_gmac4_compats)) { - plat->has_gmac4 = 1; - plat->has_gmac = 0; + plat->core_type = DWMAC_CORE_GMAC4; plat->pmt = 1; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; @@ -580,7 +579,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) } if (of_device_is_compatible(np, "snps,dwxgmac")) { - plat->has_xgmac = 1; + plat->core_type = DWMAC_CORE_XGMAC; plat->pmt = 1; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; diff --git 
a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 993ff4e87e55..3e30172fa129 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -57,7 +57,7 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta) bool xmac, est_rst = false; int ret; - xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + xmac = dwmac_is_xmac(priv->plat->core_type); if (delta < 0) { neg_adj = 1; @@ -344,7 +344,7 @@ void stmmac_ptp_register(struct stmmac_priv *priv) /* Calculate the clock domain crossing (CDC) error if necessary */ priv->plat->cdc_error_adj = 0; - if (priv->plat->has_gmac4) + if (priv->plat->core_type == DWMAC_CORE_GMAC4) priv->plat->cdc_error_adj = (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate; /* Update the ptp clock parameters based on feature discovery, when diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 99022620457a..151c81c560c8 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -171,6 +171,13 @@ struct dwmac4_addrs { u32 mtl_low_cred_offset; }; +enum dwmac_core_type { + DWMAC_CORE_MAC100, + DWMAC_CORE_GMAC, + DWMAC_CORE_GMAC4, + DWMAC_CORE_XGMAC, +}; + #define STMMAC_FLAG_SPH_DISABLE BIT(1) #define STMMAC_FLAG_USE_PHY_WOL BIT(2) #define STMMAC_FLAG_HAS_SUN8I BIT(3) @@ -186,6 +193,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) struct plat_stmmacenet_data { + enum dwmac_core_type core_type; int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media @@ -219,7 +227,6 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; - int has_gmac; int enh_desc; int tx_coe; int rx_coe; @@ -282,10 +289,8 @@ struct plat_stmmacenet_data { struct reset_control *stmmac_rst; struct reset_control *stmmac_ahb_rst; struct stmmac_axi *axi; - int has_gmac4; int rss_en; int mac_port_sel_speed; - int has_xgmac; u8 vlan_fail_q; struct pci_dev *pdev; int int_snapshot_num; -- cgit v1.2.3 From 7958b4bb806c1af800ca23c8333a98231b3ab0b1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 22 Oct 2025 15:23:42 +0200 Subject: pinctrl: pinmux: Add missing .function_is_gpio kerneldoc This callback was undocumented, add the docs. Reviewed-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinmux.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 6db6c3e1ccc2..094bbe2fd6fd 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -35,6 +35,16 @@ struct pinctrl_gpio_range; * name can be used with the generic @pinctrl_ops to retrieve the * actual pins affected. The applicable groups will be returned in * @groups and the number of groups in @num_groups + * @function_is_gpio: determine if the indicated function selector passed + * corresponds to the GPIO function which is used by the accelerated GPIO + * functions @gpio_request_enable, @gpio_disable_free and + * @gpio_set_direction. When the pin control core can properly determine + * if a function is a GPIO function, it is easier to use the @strict mode + * on the pin controller. 
Since a single function is passed, this is + * only useful on pin controllers that use a specific function for GPIO, + * and that usually presupposes that a one-group-per-pin approach is + * used, so that a single function can be set on a single pin to turn + * it to GPIO mode. * @set_mux: enable a certain muxing function with a certain pin group. The * driver does not need to figure out whether enabling this function * conflicts some other use of the pins in that group, such collisions -- cgit v1.2.3 From 243ce64b2b371cdf2cbc39c9422cb3047cab6de7 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 13 Oct 2025 12:51:57 +0200 Subject: backlight: Do not include in header file The backlight interfaces don't require anything from , so don't include it. Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson (RISCstar) Reviewed-by: Simona Vetter Link: https://patch.msgid.link/20251013105553.836715-1-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/backlight.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 10e626db7eee..f29a9ef1052e 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -10,7 +10,6 @@ #define _LINUX_BACKLIGHT_H #include -#include #include #include -- cgit v1.2.3 From f6ceec6434b5efff62cecbaa2ff74fc29b96c0c6 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Tue, 21 Oct 2025 12:09:40 +0200 Subject: net: datagram: introduce datagram_poll_queue for custom receive queues Some protocols using TCP encapsulation (e.g., espintcp, openvpn) deliver userspace-bound packets through a custom skb queue rather than the standard sk_receive_queue. Introduce datagram_poll_queue that accepts an explicit receive queue, and convert datagram_poll into a wrapper around datagram_poll_queue. This allows protocols with custom skb queues to reuse the core polling logic without relying on sk_receive_queue. Cc: Sabrina Dubroca Cc: Antonio Quartulli Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Reviewed-by: Antonio Quartulli Link: https://patch.msgid.link/20251021100942.195010-2-ralf@mandelbit.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ net/core/datagram.c | 44 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fb3fec9affaa..a7cc3d1f4fd1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4204,6 +4204,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + struct poll_table_struct *wait, + struct sk_buff_head *rcv_queue); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, diff --git a/net/core/datagram.c b/net/core/datagram.c index cb4b9ef2e4e3..c285c6465923 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -920,21 +920,22 @@ fault: EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** - * datagram_poll - generic datagram poll + * datagram_poll_queue - same as datagram_poll, but on a specific receive + * queue * @file: file struct * @sock: socket * @wait: poll table + * @rcv_queue: receive queue to poll * - * Datagram poll: Again totally generic. 
This also handles - * sequenced packet sockets providing the socket receive queue - * is only ever holding data ready to receive. + * Performs polling on the given receive queue, handling shutdown, error, + * and connection state. This is useful for protocols that deliver + * userspace-bound packets through a custom queue instead of + * sk->sk_receive_queue. * - * Note: when you *don't* use this routine for this protocol, - * and you use a different write policy from sock_writeable() - * then please supply your own write_space callback. + * Return: poll bitmask indicating the socket's current state */ -__poll_t datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; @@ -956,7 +957,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(rcv_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -978,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL(datagram_poll_queue); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + * + * Return: poll bitmask indicating the socket's current state + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + return datagram_poll_queue(file, sock, wait, + &sock->sk->sk_receive_queue); +} EXPORT_SYMBOL(datagram_poll); -- cgit v1.2.3 From c2afdd73e5ba2146c7e8b43b2607da5d4b720d9d Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Thu, 19 Jun 2025 11:56:19 +0300 Subject: mmc: core: Skip to set the default 200mA SD current limit Let's avoid updating the SD current limit when the maximum power is 200mA (0.72W) or less, as this is already the default value for the SD card. In this way we avoid sending an unnecessary command during initialization. 
Signed-off-by: Avri Altman Signed-off-by: Ulf Hansson --- drivers/mmc/core/sd.c | 7 ++----- include/linux/mmc/card.h | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index b6758911e22c..948948ca9b4a 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -554,7 +554,7 @@ static u32 sd_get_host_max_current(struct mmc_host *host) static int sd_set_current_limit(struct mmc_card *card, u8 *status) { - int current_limit = SD_SET_CURRENT_NO_CHANGE; + int current_limit = SD_SET_CURRENT_LIMIT_200; int err; u32 max_current; @@ -598,11 +598,8 @@ static int sd_set_current_limit(struct mmc_card *card, u8 *status) else if (max_current >= 400 && card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_400) current_limit = SD_SET_CURRENT_LIMIT_400; - else if (max_current >= 200 && - card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_200) - current_limit = SD_SET_CURRENT_LIMIT_200; - if (current_limit != SD_SET_CURRENT_NO_CHANGE) { + if (current_limit != SD_SET_CURRENT_LIMIT_200) { err = mmc_sd_switch(card, SD_SWITCH_SET, 3, current_limit, status); if (err) diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index ddcdf23d731c..e9e964c20e53 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -182,7 +182,6 @@ struct sd_switch_caps { #define SD_SET_CURRENT_LIMIT_400 1 #define SD_SET_CURRENT_LIMIT_600 2 #define SD_SET_CURRENT_LIMIT_800 3 -#define SD_SET_CURRENT_NO_CHANGE (-1) #define SD_MAX_CURRENT_200 (1 << SD_SET_CURRENT_LIMIT_200) #define SD_MAX_CURRENT_400 (1 << SD_SET_CURRENT_LIMIT_400) -- cgit v1.2.3 From b2284768c6b32aa224ca7d0ef0741beb434f03aa Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 22 Oct 2025 11:44:21 +0800 Subject: virtio-net: zero unused hash fields When GSO tunnel is negotiated virtio_net_hdr_tnl_from_skb() tries to initialize the tunnel metadata but forget to zero unused rxhash fields. This may leak information to another side. Fixing this by zeroing the unused hash fields. Acked-by: Michael S. Tsirkin Fixes: a2fb4bc4e2a6a ("net: implement virtio helpers to handle UDP GSO tunneling") Cc: Signed-off-by: Jason Wang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20251022034421.70244-1-jasowang@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 20e0584db1dd..4d1780848d0e 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -401,6 +401,10 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, if (!tnl_hdr_negotiated) return -EINVAL; + vhdr->hash_hdr.hash_value = 0; + vhdr->hash_hdr.hash_report = 0; + vhdr->hash_hdr.padding = 0; + /* Let the basic parsing deal with plain GSO features. */ skb_shinfo(skb)->gso_type &= ~tnl_gso_type; ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen); -- cgit v1.2.3 From bb65e0c141f879cdf54db11ae446ee3605fb54d5 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Wed, 22 Oct 2025 15:29:39 +0300 Subject: net/mlx5: Add PPHCR to PCAM supported registers mask Add the PPHCR bit to the port_access_reg_cap_mask field of PCAM register to indicate that the device supports the PPHCR register and the RS-FEC histogram feature. 
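A minimal sketch of the consumer side (assuming the new bit is read through the existing MLX5_CAP_PCAM_REG() accessor, the same way other port_access_reg_cap_mask bits such as pplm are checked; the helper name is made up):

	/* Only query the PPHCR (RS-FEC histogram) register when the device advertises it. */
	static bool mlx5e_rs_fec_histogram_supported(struct mlx5_core_dev *mdev)
	{
		return MLX5_CAP_PCAM_REG(mdev, pphcr);
	}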
Signed-off-by: Alexei Lazar Reviewed-by: Yael Chemla Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761136182-918470-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 07614cd95bed..1b0b36aa2a76 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10833,7 +10833,9 @@ struct mlx5_ifc_pcam_regs_5000_to_507f_bits { u8 port_access_reg_cap_mask_127_to_96[0x20]; u8 port_access_reg_cap_mask_95_to_64[0x20]; - u8 port_access_reg_cap_mask_63_to_36[0x1c]; + u8 port_access_reg_cap_mask_63[0x1]; + u8 pphcr[0x1]; + u8 port_access_reg_cap_mask_61_to_36[0x1a]; u8 pplm[0x1]; u8 port_access_reg_cap_mask_34_to_32[0x3]; -- cgit v1.2.3 From 4da42aaa82d6e3fa2e822e6e771d031c2e20a6c7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:54 -0300 Subject: printk: nbcon: Export console_is_usable The helper will be used on KDB code in the next commits. Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-1-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/internal.h | 45 --------------------------------------------- 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 8f10d0a85bb4..5c3a718c22fc 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -605,6 +606,48 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + /* The write_atomic() callback is optional. */ + if (use_atomic && !con->write_atomic) + return false; + + /* + * For the !use_atomic case, @printk_kthreads_running is not + * checked because the write_thread() callback is also used + * via the legacy loop when the printer threads are not + * available. + */ + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. 
+ */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } @@ -612,6 +655,8 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif extern int console_set_on_cmdline; diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index f72bbfa266d6..7e3128ec9336 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -3,7 +3,6 @@ * internal.h - printk internal definitions */ #include -#include #include #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) @@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con); void nbcon_kthread_stop(struct console *con); void nbcon_kthreads_wake(void); -/* - * Check if the given console is currently capable and allowed to print - * records. Note that this function does not consider the current context, - * which can also play a role in deciding if @con can be used to print - * records. - */ -static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) -{ - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; - - /* - * For the !use_atomic case, @printk_kthreads_running is not - * checked because the write_thread() callback is also used - * via the legacy loop when the printer threads are not - * available. - */ - } else { - if (!con->write) - return false; - } - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - /** * nbcon_kthread_wake - Wake up a console printing thread * @con: Console to operate on @@ -204,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand static inline void nbcon_kthread_wake(struct console *con) { } static inline void nbcon_kthreads_wake(void) { } -static inline bool console_is_usable(struct console *con, short flags, - bool use_atomic) { return false; } - #endif /* CONFIG_PRINTK */ extern bool have_boot_console; -- cgit v1.2.3 From 49f7d3054e84617395a37a058251c81320a3614a Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:55 -0300 Subject: printk: nbcon: Introduce KDB helpers These helpers will be used when calling console->write_atomic on KDB code in the next patch. It's basically the same implementation as nbcon_device_try_acquire, but using NBCON_PRIO_EMERGENCY when acquiring the context. If the acquire succeeds, the message and message length are assigned to nbcon_write_context so ->write_atomic can print the message. After release try to flush the console since there may be a backlog of messages in the ringbuffer. The kthread console printers do not get a chance to run while kdb is active. 
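A rough sketch of the call pattern these helpers enable on the kdb side (the actual kdb_msg_write() wiring lands later in this series; con, kdb_buf and kdb_buf_len are placeholders):

	struct nbcon_write_context wctxt = { };
	short flags = console_srcu_read_flags(con);

	if (console_is_usable(con, flags, true) &&
	    nbcon_kdb_try_acquire(con, &wctxt)) {
		wctxt.outbuf = kdb_buf;		/* message to emit */
		wctxt.len = kdb_buf_len;
		con->write_atomic(con, &wctxt);
		nbcon_kdb_release(&wctxt);	/* also flushes any printk() backlog */
	}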
Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-2-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 6 +++++ kernel/printk/nbcon.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 5c3a718c22fc..9406342b27db 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -606,6 +606,9 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); +extern bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt); +extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); /* * Check if the given console is currently capable and allowed to print @@ -655,6 +658,9 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } +static inline bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_kdb_release(struct console *con) { } static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { return false; } #endif diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 558ef3177976..e1bf5409cb6b 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1855,3 +1855,64 @@ void nbcon_device_release(struct console *con) console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release); + +/** + * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * @wctxt: The nbcon write context to be used on success + * + * Context: Under console_srcu_read_lock() for emitting a single kdb message + * using the given con->write_atomic() callback. Can be called + * only when the console is usable at the moment. + * + * Return: True if the console was acquired. False otherwise. + * + * kdb emits messages on consoles registered for printk() without + * storing them into the ring buffer. It has to acquire the console + * ownership so that it can call the con->write_atomic() callback in a safe way. + * + * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY + * and marks it unsafe for handover/takeover.
+ */ +bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_EMERGENCY; + + if (!nbcon_context_try_acquire(ctxt, false)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} + +/** + * nbcon_kdb_release - Exit unsafe section and release the nbcon console + * + * @wctxt: The nbcon write context initialized by a successful + * nbcon_kdb_try_acquire() + */ +void nbcon_kdb_release(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * Flush any new printk() messages added when the console was blocked. + * Only the console used by the given write context was blocked. + * The console was locked only when the write_atomic() callback + * was usable. + */ + __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb), false); +} -- cgit v1.2.3 From 286b113d70007e932d18aa0acfce1a3f5b25d8d1 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:56 -0300 Subject: printk: nbcon: Allow KDB to acquire the NBCON context KDB can interrupt any console to execute the "mirrored printing" at any time, so add an exception to nbcon_context_try_acquire_direct to allow to get the context if the current CPU is the same as kdb_printf_cpu. This change will be necessary for the next patch, which fixes kdb_msg_write to work with NBCON consoles by calling ->write_atomic on such consoles. But to print it first needs to acquire the ownership of the console, so nbcon_context_try_acquire_direct is fixed here. Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-3-866aac60a80e@suse.com [pmladek@suse.com: Fix compilation with !CONFIG_KGDB_KDB.] Signed-off-by: Petr Mladek --- include/linux/kdb.h | 16 ++++++++++++++++ kernel/printk/nbcon.c | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ecbf819deeca..741c58e86431 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -14,6 +14,7 @@ */ #include +#include /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as @@ -207,11 +208,26 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) /* Dynamic kdb shell command registration */ extern int kdb_register(kdbtab_t *cmd); extern void kdb_unregister(kdbtab_t *cmd); + +/* Return true when KDB as locked for printing a message on this CPU. */ +static inline +bool kdb_printf_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because the task could + * not get migrated when KDB has locked for printing on this CPU. + */ + return unlikely(READ_ONCE(kdb_printf_cpu) == raw_smp_processor_id()); +} + #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) 
{ return 0; } static inline void kdb_init(int level) {} static inline int kdb_register(kdbtab_t *cmd) { return 0; } static inline void kdb_unregister(kdbtab_t *cmd) {} + +static inline bool kdb_printf_on_this_cpu(void) { return false; } + #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index e1bf5409cb6b..5be018493909 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -249,13 +250,16 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * since all non-panic CPUs are stopped during panic(), it * is safer to have them avoid gaining console ownership. * - * If this acquire is a reacquire (and an unsafe takeover + * One exception is when kdb has locked for printing on this CPU. + * + * Second exception is a reacquire (and an unsafe takeover * has not previously occurred) then it is allowed to attempt * a direct acquire in panic. This gives console drivers an * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ if (panic_on_other_cpu() && + !kdb_printf_on_this_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } -- cgit v1.2.3 From 4349cf0df34f37d2470d246bc9be8d9836dfa49e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:57 -0300 Subject: printk: nbcon: Export nbcon_write_context_set_buf This function will be used in the next patch to allow a driver to set both the message and message length of a nbcon_write_context. This is necessary because the function also initializes the ->unsafe_takeover struct member. By using this helper we ensure that the struct is initialized correctly. 
Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-4-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 4 ++++ kernel/printk/nbcon.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 9406342b27db..4585eb8e109e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -603,6 +603,8 @@ static inline bool console_is_registered(const struct console *con) extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); +extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); @@ -655,6 +657,8 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_ static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 5be018493909..fdd1cbebe77d 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -854,8 +854,8 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } -static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, - char *buf, unsigned int len) +void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; -- cgit v1.2.3 From 62627bf0cadf6eae87d92fecf604c42160fe16ef Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:58 -0300 Subject: kdb: Adapt kdb_msg_write to work with NBCON consoles Function kdb_msg_write was calling con->write for any found console, but it won't work on NBCON consoles. In this case we should acquire the ownership of the console using NBCON_PRIO_EMERGENCY, since printing kdb messages should only be interrupted by a panic. At this point, the console is required to use the atomic callback. The console is skipped if the write_atomic callback is not set or if the context could not be acquired. The validation of NBCON is done by the console_is_usable helper. The context is released right after write_atomic finishes. The oops_in_progress handling is only needed in the legacy consoles, so it was moved around the con->write callback. Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-5-866aac60a80e@suse.com [pmladek@suse.com: Fixed compilation with !CONFIG_PRINTK.] 
Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- kernel/debug/kdb/kdb_io.c | 47 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 4585eb8e109e..d17f1f525bec 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -664,7 +664,7 @@ static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } static inline bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt) { return false; } -static inline void nbcon_kdb_release(struct console *con) { } +static inline void nbcon_kdb_release(struct nbcon_write_context *wctxt) { } static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { return false; } #endif diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index b12b9db75c1d..61c1690058ed 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if (!(console_srcu_read_flags(c) & CON_ENABLED)) + short flags = console_srcu_read_flags(c); + + if (!console_is_usable(c, flags, true)) continue; if (c == dbg_io_ops->cons) continue; - if (!c->write) - continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. - */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (flags & CON_NBCON) { + struct nbcon_write_context wctxt = { }; + + /* + * Do not continue if the console is NBCON and the context + * can't be acquired. + */ + if (!nbcon_kdb_try_acquire(c, &wctxt)) + continue; + + nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len); + + c->write_atomic(c, &wctxt); + nbcon_kdb_release(&wctxt); + } else { + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + } touch_nmi_watchdog(); } console_srcu_read_unlock(cookie); -- cgit v1.2.3 From 245f14f5fe283c782b16143280f283bee29dbb5f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 30 Sep 2025 12:30:55 +0800 Subject: interconnect: Optimize kbps_to_icc() macro The current expansion of kbps_to_icc() introduces unnecessary logic when compiled from a general expression. Rewriting it allows compilers to emit shorter and more efficient code across architectures. 
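Functionally, both forms round a kbit/s value up to the next whole icc unit (kBps, going by the neighbouring MBps_to_icc()/Mbps_to_icc() definitions); a worked illustration with an arbitrary value:

	old: kbps_to_icc(15) = 15 / 8 + (15 % 8 ? 1 : 0) = 1 + 1 = 2
	new: kbps_to_icc(15) = (15 + 7) / 8 = 22 / 8 = 2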
For example, with gcc -O2: arm64: old: tst x0, 7 add w1, w0, 7 cset w2, ne cmp w0, 0 csel w0, w1, w0, lt add w0, w2, w0, asr 3 new: add w1, w0, 14 adds w0, w0, 7 csel w0, w1, w0, mi asr w0, w0, 3 x86-64: old: xor eax, eax test dil, 7 lea edx, [rdi+7] setne al test edi, edi cmovns edx, edi sar edx, 3 add eax, edx new: lea eax, [rdi+14] add edi, 7 cmovns eax, edi sar eax, 3 In both cases the old form relies on extra test and compare instructions (tst, test, cmp) combined with conditional moves or sets, while the new form uses fewer instructions by folding the addition and flag update together (adds on arm64, add on x86). This reduces the instruction sequence, prevents multiple evaluations of x when it is an expression or a function call, and keeps the macro simpler. Signed-off-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250930043055.2200322-1-visitorckw@gmail.com Signed-off-by: Georgi Djakov --- include/linux/interconnect.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index e4b8808823ad..4b12821528a6 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -16,7 +16,7 @@ #define MBps_to_icc(x) ((x) * 1000) #define GBps_to_icc(x) ((x) * 1000 * 1000) #define bps_to_icc(x) (1) -#define kbps_to_icc(x) ((x) / 8 + ((x) % 8 ? 1 : 0)) +#define kbps_to_icc(x) (((x) + 7) / 8) #define Mbps_to_icc(x) ((x) * 1000 / 8) #define Gbps_to_icc(x) ((x) * 1000 * 1000 / 8) -- cgit v1.2.3 From 70e0a80a1f3580ccf5bc1f34dbb433c67d9d8d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Oct 2025 19:06:51 +0100 Subject: treewide: Remove in_irq() This old alias for in_hardirq() has been marked as deprecated since 2020; remove the stragglers. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251024180654.1691095-1-willy@infradead.org --- drivers/bus/fsl-mc/mc-sys.c | 2 +- drivers/md/dm-vdo/logger.c | 2 +- include/linux/lockdep.h | 2 +- include/linux/preempt.h | 2 -- kernel/bpf/syscall.c | 4 ++-- kernel/time/timer.c | 2 +- lib/locking-selftest.c | 4 ++-- 7 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/fsl-mc/mc-sys.c b/drivers/bus/fsl-mc/mc-sys.c index b22c59d57c8f..31037f41893e 100644 --- a/drivers/bus/fsl-mc/mc-sys.c +++ b/drivers/bus/fsl-mc/mc-sys.c @@ -248,7 +248,7 @@ int mc_send_command(struct fsl_mc_io *mc_io, struct fsl_mc_command *cmd) enum mc_cmd_status status; unsigned long irq_flags = 0; - if (in_irq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL)) + if (in_hardirq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL)) return -EINVAL; if (mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL) diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 3f7dc2cb6b98..76a987ccf926 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -34,7 +34,7 @@ static const char *get_current_interrupt_type(void) if (in_nmi()) return "NMI"; - if (in_irq()) + if (in_hardirq()) return "HI"; if (in_softirq()) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 67964dc4db95..dd634103b014 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -616,7 +616,7 @@ do { \ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ - (!in_softirq() || in_irq() || in_nmi())); \ + (!in_softirq() || in_hardirq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 102202185d7a..d964f965c8ff 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void) /* * The following macros are deprecated and should not be used in new code: - * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ -#define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..6cde6a46babf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2330,7 +2330,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (!in_irq() && !irqs_disabled()) + if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2428,7 +2428,7 @@ static void __bpf_prog_put(struct bpf_prog *prog) struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..282a8e5c05f8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2472,7 +2472,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); diff --git 
a/lib/locking-selftest.c b/lib/locking-selftest.c index ed99344317f5..d939403331b5 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -202,7 +202,7 @@ static void init_shared_classes(void) local_irq_disable(); \ __irq_enter(); \ lockdep_hardirq_threaded(); \ - WARN_ON(!in_irq()); + WARN_ON(!in_hardirq()); #define HARDIRQ_EXIT() \ __irq_exit(); \ @@ -2512,7 +2512,7 @@ DEFINE_LOCK_GUARD_0(NOTTHREADED_HARDIRQ, do { local_irq_disable(); __irq_enter(); - WARN_ON(!in_irq()); + WARN_ON(!in_hardirq()); } while(0), HARDIRQ_EXIT()) DEFINE_LOCK_GUARD_0(SOFTIRQ, SOFTIRQ_ENTER(), SOFTIRQ_EXIT()) -- cgit v1.2.3 From e30f8e61e2518a837837daa26cda3c8cc30f3226 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:40 -0400 Subject: tracing: Add a tracepoint verification check at build time If a tracepoint is defined via DECLARE_TRACE() or TRACE_EVENT() but never called (via the trace_() function), its metadata is still around in memory and not discarded. When created via TRACE_EVENT() the situation is worse because the TRACE_EVENT() creates metadata that can be around 5k per trace event. Having unused trace events causes several thousand of wasted bytes. Add a verifier that injects a string of the name of the tracepoint it calls that is added to the discarded section "__tracepoint_check". For every builtin tracepoint, its name (which is saved in the in-memory section "__tracepoint_strings") will have its name also in the "__tracepoint_check" section if it is used. Add a new program that is run on build called tracepoint-update. This is executed on the vmlinux.o before the __tracepoint_check section is discarded (the section is discarded before vmlinux is created). This program will create an array of each string in the __tracepoint_check section and then sort it. Then it will walk the strings in the __tracepoint_strings section and do a binary search to check if its name is in the __tracepoint_check section. If it is not, then it is unused and a warning is printed. Note, this currently only handles tracepoints that are builtin and not in modules. Enabling this currently with a given config produces: warning: tracepoint 'sched_move_numa' is unused. warning: tracepoint 'sched_stick_numa' is unused. warning: tracepoint 'sched_swap_numa' is unused. warning: tracepoint 'pelt_hw_tp' is unused. warning: tracepoint 'pelt_irq_tp' is unused. warning: tracepoint 'rcu_preempt_task' is unused. warning: tracepoint 'rcu_unlock_preempted_task' is unused. warning: tracepoint 'xdp_bulk_tx' is unused. warning: tracepoint 'xdp_redirect_map' is unused. warning: tracepoint 'xdp_redirect_map_err' is unused. warning: tracepoint 'vma_mas_szero' is unused. warning: tracepoint 'vma_store' is unused. warning: tracepoint 'hugepage_set_pmd' is unused. warning: tracepoint 'hugepage_set_pud' is unused. warning: tracepoint 'hugepage_update_pmd' is unused. warning: tracepoint 'hugepage_update_pud' is unused. warning: tracepoint 'block_rq_remap' is unused. warning: tracepoint 'xhci_dbc_handle_event' is unused. warning: tracepoint 'xhci_dbc_handle_transfer' is unused. warning: tracepoint 'xhci_dbc_gadget_ep_queue' is unused. warning: tracepoint 'xhci_dbc_alloc_request' is unused. warning: tracepoint 'xhci_dbc_free_request' is unused. warning: tracepoint 'xhci_dbc_queue_request' is unused. warning: tracepoint 'xhci_dbc_giveback_request' is unused. warning: tracepoint 'tcp_ao_wrong_maclen' is unused. warning: tracepoint 'tcp_ao_mismatch' is unused. warning: tracepoint 'tcp_ao_key_not_found' is unused. 
warning: tracepoint 'tcp_ao_rnext_request' is unused. warning: tracepoint 'tcp_ao_synack_no_key' is unused. warning: tracepoint 'tcp_ao_snd_sne_update' is unused. warning: tracepoint 'tcp_ao_rcv_sne_update' is unused. Some of the above is totally unused but others are not used due to their "trace_" functions being inside configs, in which case, the defined tracepoints should also be inside those same configs. Others are architecture specific but defined in generic code, where they should either be moved to the architecture or be surrounded by #ifdef for the architectures they are for. This tool could be updated to process modules in the future. I'd like to thank Mathieu Desnoyers for suggesting using strings instead of pointers, as using pointers in vmlinux.o required handling relocations and it required implementing almost a full feature linker to do so. To enable this check, run the build with: make UT=1 Note, when all the existing unused tracepoints are removed from the build, the "UT=1" will be removed and this will always be enabled when tracepoints are configured to warn on any new tracepoints. The reason this isn't always enabled now is because it will introduce a lot of warnings for the current unused tracepoints, and all bisects would end at this commit for those warnings. Link: https://lore.kernel.org/all/20250528114549.4d8a5e03@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004452.920728129@kernel.org Suggested-by: Mathieu Desnoyers # for using strings instead of pointers Signed-off-by: Steven Rostedt (Google) --- Makefile | 21 ++++ include/asm-generic/vmlinux.lds.h | 1 + include/linux/tracepoint.h | 11 ++ scripts/Makefile | 3 + scripts/link-vmlinux.sh | 7 ++ scripts/tracepoint-update.c | 232 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 275 insertions(+) create mode 100644 scripts/tracepoint-update.c (limited to 'include/linux') diff --git a/Makefile b/Makefile index d14824792227..9823c20c4278 100644 --- a/Makefile +++ b/Makefile @@ -810,6 +810,25 @@ ifdef CONFIG_FUNCTION_TRACER CC_FLAGS_FTRACE := -pg endif +ifdef CONFIG_TRACEPOINTS +# To check for unused tracepoints (tracepoints that are defined but never +# called), run with: +# +# make UT=1 +# +# Each unused tracepoints can take up to 5KB of memory in the running kernel. +# It is best to remove any that are not used. +# +# This command line option will be removed when all current unused +# tracepoints are removed. + +ifeq ("$(origin UT)", "command line") + WARN_ON_UNUSED_TRACEPOINTS := $(UT) +endif +endif # CONFIG_TRACEPOINTS + +export WARN_ON_UNUSED_TRACEPOINTS + include $(srctree)/arch/$(SRCARCH)/Makefile ifdef need-config @@ -1772,6 +1791,8 @@ help: @echo ' c: extra checks in the configuration stage (Kconfig)' @echo ' e: warnings are being treated as errors' @echo ' Multiple levels can be combined with W=12 or W=123' + @echo ' make UT=1 [targets] Warn if a tracepoint is defined but not used.' + @echo ' [ This will be removed when all current unused tracepoints are eliminated. 
]' @$(if $(dtstree), \ echo ' make CHECK_DTBS=1 [targets] Check all generated dtb files against schema'; \ echo ' This can be applied both to "dtbs" and to individual "foo.dtb" targets' ; \ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..c510fb097a8c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1048,6 +1048,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.no_trim_symbol) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ + *(__tracepoint_check) \ #define DISCARDS \ /DISCARD/ : { \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 826ce3f8e1f8..1e53d3626c78 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -221,6 +221,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __do_trace_##name(args); \ } +/* + * When a tracepoint is used, it's name is added to the __tracepoint_check + * section. This section is only used at build time to make sure all + * defined tracepoints are used. It is discarded after the build. + */ +# define TRACEPOINT_CHECK(name) \ + static const char __used __section("__tracepoint_check") __trace_check[] = \ + #name; + /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -270,6 +279,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ if (cond) { \ guard(preempt_notrace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -289,6 +299,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ diff --git a/scripts/Makefile b/scripts/Makefile index f19624b3ed92..0941e5ce7b57 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -11,8 +11,10 @@ hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_builder hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_gen +hostprogs-always-$(CONFIG_TRACEPOINTS) += tracepoint-update sorttable-objs := sorttable.o elf-parse.o +tracepoint-update-objs := tracepoint-update.o elf-parse.o ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),) always-$(CONFIG_RUST) += target.json @@ -27,6 +29,7 @@ generate_rust_target-rust := y rustdoc_test_builder-rust := y rustdoc_test_gen-rust := y +HOSTCFLAGS_tracepoint-update.o = -I$(srctree)/tools/include HOSTCFLAGS_elf-parse.o = -I$(srctree)/tools/include HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include HOSTLDLIBS_sorttable = -lpthread diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 433849ff7529..d304029fa6da 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -208,6 +208,13 @@ kallsymso= strip_debug= generate_map= +# Use "make UT=1" to trigger warnings on unused tracepoints +case "${WARN_ON_UNUSED_TRACEPOINTS}" in +*1*) + ${objtree}/scripts/tracepoint-update vmlinux.o + ;; +esac + if is_enabled CONFIG_KALLSYMS; then true > .tmp_vmlinux0.syms kallsyms 
.tmp_vmlinux0.syms .tmp_vmlinux0.kallsyms diff --git a/scripts/tracepoint-update.c b/scripts/tracepoint-update.c new file mode 100644 index 000000000000..6ec30f39d0ad --- /dev/null +++ b/scripts/tracepoint-update.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "elf-parse.h" + +static Elf_Shdr *check_data_sec; +static Elf_Shdr *tracepoint_data_sec; + +static inline void *get_index(void *start, int entsize, int index) +{ + return start + (entsize * index); +} + +static int compare_strings(const void *a, const void *b) +{ + const char *av = *(const char **)a; + const char *bv = *(const char **)b; + + return strcmp(av, bv); +} + +struct elf_tracepoint { + Elf_Ehdr *ehdr; + const char **array; + int count; +}; + +#define REALLOC_SIZE (1 << 10) +#define REALLOC_MASK (REALLOC_SIZE - 1) + +static int add_string(const char *str, const char ***vals, int *count) +{ + const char **array = *vals; + + if (!(*count & REALLOC_MASK)) { + int size = (*count) + REALLOC_SIZE; + + array = realloc(array, sizeof(char *) * size); + if (!array) { + fprintf(stderr, "Failed memory allocation\n"); + return -1; + } + *vals = array; + } + + array[(*count)++] = str; + return 0; +} + +/** + * for_each_shdr_str - iterator that reads strings that are in an ELF section. + * @len: "int" to hold the length of the current string + * @ehdr: A pointer to the ehdr of the ELF file + * @sec: The section that has the strings to iterate on + * + * This is a for loop that iterates over all the nul terminated strings + * that are in a given ELF section. The variable "str" will hold + * the current string for each iteration and the passed in @len will + * contain the strlen() of that string. + */ +#define for_each_shdr_str(len, ehdr, sec) \ + for (const char *str = (void *)(ehdr) + shdr_offset(sec), \ + *end = str + shdr_size(sec); \ + len = strlen(str), str < end; \ + str += (len) + 1) + + +static void make_trace_array(struct elf_tracepoint *etrace) +{ + Elf_Ehdr *ehdr = etrace->ehdr; + const char **vals = NULL; + int count = 0; + int len; + + etrace->array = NULL; + + /* + * The __tracepoint_check section is filled with strings of the + * names of tracepoints (in tracepoint_strings). Create an array + * that points to each string and then sort the array. + */ + for_each_shdr_str(len, ehdr, check_data_sec) { + if (!len) + continue; + if (add_string(str, &vals, &count) < 0) + return; + } + + /* If CONFIG_TRACEPOINT_VERIFY_USED is not set, there's nothing to do */ + if (!count) + return; + + qsort(vals, count, sizeof(char *), compare_strings); + + etrace->array = vals; + etrace->count = count; +} + +static int find_event(const char *str, void *array, size_t size) +{ + return bsearch(&str, array, size, sizeof(char *), compare_strings) != NULL; +} + +static void check_tracepoints(struct elf_tracepoint *etrace) +{ + Elf_Ehdr *ehdr = etrace->ehdr; + int len; + + if (!etrace->array) + return; + + /* + * The __tracepoints_strings section holds all the names of the + * defined tracepoints. If any of them are not in the + * __tracepoint_check_section it means they are not used. 
+ */ + for_each_shdr_str(len, ehdr, tracepoint_data_sec) { + if (!len) + continue; + if (!find_event(str, etrace->array, etrace->count)) { + fprintf(stderr, "warning: tracepoint '%s' is unused.\n", str); + } + } + + free(etrace->array); +} + +static void *tracepoint_check(struct elf_tracepoint *etrace) +{ + make_trace_array(etrace); + check_tracepoints(etrace); + + return NULL; +} + +static int process_tracepoints(void *addr, char const *const fname) +{ + struct elf_tracepoint etrace = {0}; + Elf_Ehdr *ehdr = addr; + Elf_Shdr *shdr_start; + Elf_Shdr *string_sec; + const char *secstrings; + unsigned int shnum; + unsigned int shstrndx; + int shentsize; + int idx; + int done = 2; + + shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr)); + shentsize = ehdr_shentsize(ehdr); + + shstrndx = ehdr_shstrndx(ehdr); + if (shstrndx == SHN_XINDEX) + shstrndx = shdr_link(shdr_start); + string_sec = get_index(shdr_start, shentsize, shstrndx); + secstrings = (const char *)ehdr + shdr_offset(string_sec); + + shnum = ehdr_shnum(ehdr); + if (shnum == SHN_UNDEF) + shnum = shdr_size(shdr_start); + + for (int i = 0; done && i < shnum; i++) { + Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); + + idx = shdr_name(shdr); + + /* locate the __tracepoint_check in vmlinux */ + if (!strcmp(secstrings + idx, "__tracepoint_check")) { + check_data_sec = shdr; + done--; + } + + /* locate the __tracepoints_ptrs section in vmlinux */ + if (!strcmp(secstrings + idx, "__tracepoints_strings")) { + tracepoint_data_sec = shdr; + done--; + } + } + + if (!check_data_sec) { + fprintf(stderr, "no __tracepoint_check in file: %s\n", fname); + return -1; + } + + if (!tracepoint_data_sec) { + fprintf(stderr, "no __tracepoint_strings in file: %s\n", fname); + return -1; + } + + etrace.ehdr = ehdr; + tracepoint_check(&etrace); + return 0; +} + +int main(int argc, char *argv[]) +{ + int n_error = 0; + size_t size = 0; + void *addr = NULL; + + if (argc < 2) { + fprintf(stderr, "usage: tracepoint-update vmlinux...\n"); + return 0; + } + + /* Process each file in turn, allowing deep failure. */ + for (int i = 1; i < argc; i++) { + addr = elf_map(argv[i], &size, 1 << ET_REL); + if (!addr) { + ++n_error; + continue; + } + + if (process_tracepoints(addr, argv[i])) + ++n_error; + + elf_unmap(addr, size); + } + + return !!n_error; +} -- cgit v1.2.3 From faf938153cad98d97f60ac835ead1db74961507e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:41 -0400 Subject: tracepoint: Do not warn for unused event that is exported There are a few generic events that may only be used by modules. They are defined and then set with EXPORT_TRACEPOINT*(). Mark events that are exported as being used, even though they still waste memory in the kernel proper. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004453.089254920@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 1e53d3626c78..8a56f3278b1b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -227,8 +227,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * defined tracepoints are used. 
It is discarded after the build. */ # define TRACEPOINT_CHECK(name) \ - static const char __used __section("__tracepoint_check") __trace_check[] = \ - #name; + static const char __used __section("__tracepoint_check") \ + __trace_check_##name[] = #name; /* * Make sure the alignment of the structure in the __tracepoints section will @@ -382,10 +382,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ EXPORT_SYMBOL_GPL(__traceiter_##name); \ EXPORT_STATIC_CALL_GPL(tp_func_##name) #define EXPORT_TRACEPOINT_SYMBOL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL(__tracepoint_##name); \ EXPORT_SYMBOL(__traceiter_##name); \ EXPORT_STATIC_CALL(tp_func_##name) -- cgit v1.2.3 From 330ce8ffc1848cbfa3e06c2c22750cfffa115579 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:30 +0100 Subject: net: phy: add phy_can_wakeup() Add phy_can_wakeup() to report whether the PHY driver has marked the PHY device as being wake-up capable as far as the driver model is concerned. Reviewed-by: Maxime Chevallier Reviewed-by: Florian Fainelli Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrQs-0000000BLzI-0w3U@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3c7634482356..3eeeaec52832 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1379,6 +1379,18 @@ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode linkmode_clear_bit(link_mode, phydev->advertising_eee); } +/** + * phy_can_wakeup() - indicate whether PHY has driver model wakeup capabilities + * @phydev: The phy_device struct + * + * Returns: true/false depending on the PHY driver's device_set_wakeup_capable() + * setting. + */ +static inline bool phy_can_wakeup(struct phy_device *phydev) +{ + return device_can_wakeup(&phydev->mdio.dev); +} + void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -- cgit v1.2.3 From b344bfacf1de2dd776a218ce8341b9c672745a01 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:35 +0100 Subject: net: phy: add phy_may_wakeup() Add phy_may_wakeup() which uses the driver model's device_may_wakeup() when the PHY driver has marked the device as wakeup capable in the driver model, otherwise use phy_drv_wol_enabled(). Replace the sites that used to call phy_drv_wol_enabled() with this as checking the driver model will be more efficient than checking the WoL state. Export phy_may_wakeup() so that phylink can use it. 
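A hedged usage sketch (the concrete conversions are in the hunks below; only the suspend-path shape is illustrative):

	/*
	 * phy_can_wakeup():  the PHY driver registered wakeup capability with
	 *                    the driver model (device_set_wakeup_capable()).
	 * phy_may_wakeup():  wakeup is actually enabled - driver-model state
	 *                    when available, otherwise the legacy WoL query
	 *                    through the driver's get_wol() callback.
	 */
	if (!netdev && phy_may_wakeup(phydev))
		return false;	/* WoL armed: the PHY must not be suspended */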
Reviewed-by: Maxime Chevallier Reviewed-by: Florian Fainelli Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrQx-0000000BLzO-1RLt@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 14 ++++++++++++-- include/linux/phy.h | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7a67c900e79a..b7feaf0cb1df 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -251,6 +251,16 @@ static bool phy_drv_wol_enabled(struct phy_device *phydev) return wol.wolopts != 0; } +bool phy_may_wakeup(struct phy_device *phydev) +{ + /* If the PHY is using driver-model based wakeup, use that state. */ + if (phy_can_wakeup(phydev)) + return device_may_wakeup(&phydev->mdio.dev); + + return phy_drv_wol_enabled(phydev); +} +EXPORT_SYMBOL_GPL(phy_may_wakeup); + static void phy_link_change(struct phy_device *phydev, bool up) { struct net_device *netdev = phydev->attached_dev; @@ -302,7 +312,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) /* If the PHY on the mido bus is not attached but has WOL enabled * we cannot suspend the PHY. */ - if (!netdev && phy_drv_wol_enabled(phydev)) + if (!netdev && phy_may_wakeup(phydev)) return false; /* PHY not attached? May suspend if the PHY has not already been @@ -1909,7 +1919,7 @@ int phy_suspend(struct phy_device *phydev) if (phydev->suspended || !phydrv) return 0; - phydev->wol_enabled = phy_drv_wol_enabled(phydev) || + phydev->wol_enabled = phy_may_wakeup(phydev) || (netdev && netdev->ethtool->wol_enabled); /* If the device has WOL enabled, we cannot suspend the PHY */ if (phydev->wol_enabled && !(phydrv->flags & PHY_ALWAYS_CALL_SUSPEND)) diff --git a/include/linux/phy.h b/include/linux/phy.h index 3eeeaec52832..17a2cdc9f1a0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1391,6 +1391,15 @@ static inline bool phy_can_wakeup(struct phy_device *phydev) return device_can_wakeup(&phydev->mdio.dev); } +/** + * phy_may_wakeup() - indicate whether PHY has wakeup enabled + * @phydev: The phy_device struct + * + * Returns: true/false depending on the PHY driver's device_set_wakeup_enabled() + * setting if using the driver model, otherwise the legacy determination. + */ +bool phy_may_wakeup(struct phy_device *phydev); + void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -- cgit v1.2.3 From b79fbd86c84918790c128e6899b420de4667018e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:40 +0100 Subject: net: phylink: add phylink managed MAC Wake-on-Lan support Add core phylink managed Wake-on-Lan support, which is enabled when the MAC driver fills in the new .mac_wol_set() method that this commit creates. When this feature is disabled, phylink acts as it has in the past, merely passing the ethtool WoL calls to phylib whenever a PHY exists. No other new functionality provided by this commit is enabled. When this feature is enabled, a more inteligent approach is used. Phylink will first pass WoL options to the PHY, read them back, and attempt to set any options that were not set at the PHY at the MAC. Since we have PHY drivers that report they support WoL, and accept WoL configuration even though they aren't wired up to be capable of waking the system, we need a way to differentiate between PHYs that think they support WoL and those which actually do. 
As PHY drivers do not make use of the driver model's wake-up infrastructure, but could, we use this to determine whether PHY drivers can participate. This gives a path forward where, as MAC drivers are converted to this, it encourages PHY drivers to also be converted. Phylink will also ignore the mac_wol argument to phylink_suspend() as it now knows the WoL state at the MAC. MAC drivers are expected to record/configure the Wake-on-Lan state in their .mac_set_wol() method, and deal appropriately with it in their suspend/resume methods. The driver model provides assistance to set the IRQ wake support which may assist driver authors in achieving the necessary configuration. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrR2-0000000BLzU-1xYL@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 80 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/phylink.h | 26 +++++++++++++++ 2 files changed, 102 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 9d7799ea1c17..bec44ebdf80b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -93,6 +93,9 @@ struct phylink { u8 sfp_port; struct eee_config eee_cfg; + + u32 wolopts_mac; + u8 wol_sopass[SOPASS_MAX]; }; #define phylink_printk(level, pl, fmt, ...) \ @@ -2562,6 +2565,17 @@ void phylink_rx_clk_stop_unblock(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_rx_clk_stop_unblock); +static bool phylink_mac_supports_wol(struct phylink *pl) +{ + return !!pl->mac_ops->mac_wol_set; +} + +static bool phylink_phy_supports_wol(struct phylink *pl, + struct phy_device *phydev) +{ + return phydev && (pl->config->wol_phy_legacy || phy_can_wakeup(phydev)); +} + /** * phylink_suspend() - handle a network device suspend event * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -2575,11 +2589,17 @@ EXPORT_SYMBOL_GPL(phylink_rx_clk_stop_unblock); * can also bring down the link between the MAC and PHY. * - If Wake-on-Lan is active, but being handled by the MAC, the MAC * still needs to receive packets, so we can not bring the link down. + * + * Note: when phylink managed Wake-on-Lan is in use, @mac_wol is ignored. + * (struct phylink_mac_ops.mac_set_wol populated.) */ void phylink_suspend(struct phylink *pl, bool mac_wol) { ASSERT_RTNL(); + if (phylink_mac_supports_wol(pl)) + mac_wol = !!pl->wolopts_mac; + if (mac_wol && (!pl->netdev || pl->netdev->ethtool->wol_enabled)) { /* Wake-on-Lan enabled, MAC handling */ mutex_lock(&pl->state_mutex); @@ -2689,8 +2709,24 @@ void phylink_ethtool_get_wol(struct phylink *pl, struct ethtool_wolinfo *wol) wol->supported = 0; wol->wolopts = 0; - if (pl->phydev) - phy_ethtool_get_wol(pl->phydev, wol); + if (phylink_mac_supports_wol(pl)) { + if (phylink_phy_supports_wol(pl, pl->phydev)) + phy_ethtool_get_wol(pl->phydev, wol); + + /* Where the MAC augments the WoL support, merge its support and + * current configuration. 
+ */ + if (~wol->wolopts & pl->wolopts_mac & WAKE_MAGICSECURE) + memcpy(wol->sopass, pl->wol_sopass, + sizeof(wol->sopass)); + + wol->supported |= pl->config->wol_mac_support; + wol->wolopts |= pl->wolopts_mac; + } else { + /* Legacy */ + if (pl->phydev) + phy_ethtool_get_wol(pl->phydev, wol); + } } EXPORT_SYMBOL_GPL(phylink_ethtool_get_wol); @@ -2707,12 +2743,48 @@ EXPORT_SYMBOL_GPL(phylink_ethtool_get_wol); */ int phylink_ethtool_set_wol(struct phylink *pl, struct ethtool_wolinfo *wol) { + struct ethtool_wolinfo w = { .cmd = ETHTOOL_GWOL }; int ret = -EOPNOTSUPP; + bool changed; + u32 wolopts; ASSERT_RTNL(); - if (pl->phydev) - ret = phy_ethtool_set_wol(pl->phydev, wol); + if (phylink_mac_supports_wol(pl)) { + wolopts = wol->wolopts; + + if (phylink_phy_supports_wol(pl, pl->phydev)) { + ret = phy_ethtool_set_wol(pl->phydev, wol); + if (ret != 0 && ret != -EOPNOTSUPP) + return ret; + + phy_ethtool_get_wol(pl->phydev, &w); + + /* Any Wake-on-Lan modes which the PHY is handling + * should not be passed on to the MAC. + */ + wolopts &= ~w.wolopts; + } + + wolopts &= pl->config->wol_mac_support; + changed = pl->wolopts_mac != wolopts; + if (wolopts & WAKE_MAGICSECURE) + changed |= !!memcmp(wol->sopass, pl->wol_sopass, + sizeof(wol->sopass)); + memcpy(pl->wol_sopass, wol->sopass, sizeof(pl->wol_sopass)); + + if (changed) { + ret = pl->mac_ops->mac_wol_set(pl->config, wolopts, + wol->sopass); + if (!ret) + pl->wolopts_mac = wolopts; + } else { + ret = 0; + } + } else { + if (pl->phydev) + ret = phy_ethtool_set_wol(pl->phydev, wol); + } return ret; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 9af0411761d7..59cb58b29d1d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -156,6 +156,8 @@ enum phylink_op_type { * @lpi_capabilities: MAC speeds which can support LPI signalling * @lpi_timer_default: Default EEE LPI timer setting. * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time + * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false + * @wol_mac_support: Bitmask of MAC supported %WAKE_* options */ struct phylink_config { struct device *dev; @@ -173,6 +175,10 @@ struct phylink_config { unsigned long lpi_capabilities; u32 lpi_timer_default; bool eee_enabled_default; + + /* Wake-on-Lan support */ + bool wol_phy_legacy; + u32 wol_mac_support; }; void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); @@ -188,6 +194,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); * @mac_link_up: allow the link to come up. * @mac_disable_tx_lpi: disable LPI. * @mac_enable_tx_lpi: enable and configure LPI. + * @mac_wol_set: configure Wake-on-Lan settings at the MAC. * * The individual methods are described more fully below. */ @@ -211,6 +218,9 @@ struct phylink_mac_ops { void (*mac_disable_tx_lpi)(struct phylink_config *config); int (*mac_enable_tx_lpi)(struct phylink_config *config, u32 timer, bool tx_clk_stop); + + int (*mac_wol_set)(struct phylink_config *config, u32 wolopts, + const u8 *sopass); }; #if 0 /* For kernel-doc purposes only. */ @@ -440,6 +450,22 @@ void mac_disable_tx_lpi(struct phylink_config *config); */ int mac_enable_tx_lpi(struct phylink_config *config, u32 timer, bool tx_clk_stop); + +/** + * mac_wol_set() - configure the Wake-on-Lan parameters + * @config: a pointer to a &struct phylink_config. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. 
+ * @sopass: SecureOn(tm) password; meaningful only for %WAKE_MAGICSECURE + * + * Enable the specified Wake-on-Lan options at the MAC. Options that the + * PHY can handle will have been removed from @wolopts. + * + * The presence of this method enables phylink-managed WoL support. + * + * Returns: 0 on success. + */ +int (*mac_wol_set)(struct phylink_config *config, u32 wolopts, + const u8 *sopass); #endif struct phylink_pcs_ops; -- cgit v1.2.3 From dc1a2a9ce5b2c80e02115ff6fb29b726ad9d7777 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:45 +0100 Subject: net: phylink: add phylink managed wake-on-lan PHY speed control Some drivers, e.g. stmmac, use the speed_up()/speed_down() APIs to gain additional power saving during Wake-on-LAN where the PHY is managing the state. Add support to phylink for this, which can be enabled by the MAC driver. Only change the PHY speed if the PHY is configured for wake-up, but without any wake-up on the MAC side, as MAC side means changing the configuration once the negotiation has completed. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrR7-0000000BLza-2PjK@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 12 ++++++++++++ include/linux/phylink.h | 2 ++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index bec44ebdf80b..6e1243bf68aa 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2576,6 +2576,12 @@ static bool phylink_phy_supports_wol(struct phylink *pl, return phydev && (pl->config->wol_phy_legacy || phy_can_wakeup(phydev)); } +static bool phylink_phy_pm_speed_ctrl(struct phylink *pl) +{ + return pl->config->wol_phy_speed_ctrl && !pl->wolopts_mac && + pl->phydev && phy_may_wakeup(pl->phydev); +} + /** * phylink_suspend() - handle a network device suspend event * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -2625,6 +2631,9 @@ void phylink_suspend(struct phylink *pl, bool mac_wol) } else { phylink_stop(pl); } + + if (phylink_phy_pm_speed_ctrl(pl)) + phylink_speed_down(pl, false); } EXPORT_SYMBOL_GPL(phylink_suspend); @@ -2664,6 +2673,9 @@ void phylink_resume(struct phylink *pl) { ASSERT_RTNL(); + if (phylink_phy_pm_speed_ctrl(pl)) + phylink_speed_up(pl); + if (test_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state)) { /* Wake-on-Lan enabled, MAC handling */ diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 59cb58b29d1d..38363e566ac3 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -157,6 +157,7 @@ enum phylink_op_type { * @lpi_timer_default: Default EEE LPI timer setting. * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false + * @wol_phy_speed_ctrl: Use phy speed control on suspend/resume * @wol_mac_support: Bitmask of MAC supported %WAKE_* options */ struct phylink_config { @@ -178,6 +179,7 @@ struct phylink_config { /* Wake-on-Lan support */ bool wol_phy_legacy; + bool wol_phy_speed_ctrl; u32 wol_mac_support; }; -- cgit v1.2.3 From eea31f21dce10814e34dc7ef7ed5136269c7bb59 Mon Sep 17 00:00:00 2001 From: Adithya Jayachandran Date: Wed, 15 Oct 2025 18:40:55 -0700 Subject: {rdma,net}/mlx5: Query vports mac address from device Before this patch during either switchdev or legacy mode enablement we cleared the mac address of vports between changes. 
This change allows us to preserve the vports mac address between eswitch mode changes. Vports hold information for VFs/SFs such as the permanent mac address. VF/SF mac can be set either by iproute vf interface or devlink function interface. For no obvious reason we reset it to 0 on switchdev/legacy mode changes, this patch is fixing that, to align with other vport information that are never reset, e.g GUID,mtu,promisc mode, etc .. Signed-off-by: Adithya Jayachandran Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Acked-by: Leon Romanovsky # RDMA --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 20 +++++++----------- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 24 +++++++++++----------- include/linux/mlx5/vport.h | 3 ++- 5 files changed, 25 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fc1e86f6c409..90daa58126f4 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -842,7 +842,7 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, break; case MLX5_VPORT_ACCESS_METHOD_NIC: - err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, false, &tmp); break; default: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index e2ffb87b94cb..25af8bd7f077 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -875,13 +875,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) vport_num, 1, vport->info.link_state); - /* Host PF has its own mac/guid. */ - if (vport_num) { - mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, - vport->info.mac); - mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, - vport->info.node_guid); - } + mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, + vport->info.mac); + mlx5_query_nic_vport_node_guid(esw->dev, vport_num, true, + &vport->info.node_guid); flags = (vport->info.vlan || vport->info.qos) ? SET_VLAN_STRIP | SET_VLAN_INSERT : 0; @@ -947,12 +944,6 @@ int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport, goto err_vhca_mapping; } - /* External controller host PF has factory programmed MAC. - * Read it from the device. 
- */ - if (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) - mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, vport->info.mac); - esw_vport_change_handle_locked(vport); esw->enabled_vports++; @@ -2235,6 +2226,9 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, ivi->vf = vport - 1; mutex_lock(&esw->state_lock); + + mlx5_query_nic_vport_mac_address(esw->dev, vport, true, + evport->info.mac); ether_addr_copy(ivi->mac, evport->info.mac); ivi->linkstate = evport->info.link_state; ivi->vlan = evport->info.vlan; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 4cf995be127d..880e238497b1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -4303,6 +4303,9 @@ int mlx5_devlink_port_fn_hw_addr_get(struct devlink_port *port, struct mlx5_vport *vport = mlx5_devlink_port_vport_get(port); mutex_lock(&esw->state_lock); + + mlx5_query_nic_vport_mac_address(esw->dev, vport->vport, true, + vport->info.mac); ether_addr_copy(hw_addr, vport->info.mac); *hw_addr_len = ETH_ALEN; mutex_unlock(&esw->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 2ed2e530b07d..d1483f66cd0c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -78,15 +78,14 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, } static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, - u32 *out) + bool other_vport, u32 *out) { u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); - if (vport) - MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + MLX5_SET(query_nic_vport_context_in, in, other_vport, other_vport); return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); } @@ -97,7 +96,7 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; int err; - err = mlx5_query_nic_vport_context(mdev, vport, out); + err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out); if (!err) *min_inline = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.min_wqe_inline_mode); @@ -219,7 +218,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (!err) *mtu = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.mtu); @@ -429,7 +428,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -451,7 +450,7 @@ int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -462,7 +461,8 @@ out: return err; } -int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, bool other_vport, u64 *node_guid) { u32 *out; int outlen = 
MLX5_ST_SZ_BYTES(query_nic_vport_context_out); @@ -472,7 +472,7 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, vport, other_vport, out); if (err) goto out; @@ -529,7 +529,7 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; @@ -804,7 +804,7 @@ int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, vport, out); + err = mlx5_query_nic_vport_context(mdev, vport, vport > 0, out); if (err) goto out; @@ -908,7 +908,7 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, false, out); if (err) goto out; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c87b9507cfa1..f876bfc0669c 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -73,7 +73,8 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); -int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, bool other_vport, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From f864e4b721e386be132cc973eadefe5d52cdfd94 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 15 Oct 2025 20:26:07 +0100 Subject: clk: renesas: rzv2h: Add support for DSI clocks Add support for PLLDSI and its post-dividers in the RZ/V2H CPG driver and export helper APIs for use by the DSI driver. Introduce per-PLL-DSI state in the CPG private structure and provide a set of helper functions that find valid PLL parameter combinations for a requested frequency. The new helpers are rzv2h_get_pll_pars(), rzv2h_get_pll_div_pars(), rzv2h_get_pll_divs_pars() and rzv2h_get_pll_dtable_pars() and they are exported in the "RZV2H_CPG" namespace for use by other consumers (notably the DSI driver). These helpers perform iterative searches over PLL parameters (M, K, P, S) and optional post-dividers and return the best match (or an exact match when possible). Move PLL/CLK related limits and parameter types into the shared include (include/linux/clk/renesas.h) by adding struct rzv2h_pll_limits, struct rzv2h_pll_pars and struct rzv2h_pll_div_pars plus the RZV2H_CPG_PLL_DSI_LIMITS() helper macro to define DSI PLL limits. This change centralises the PLLDSI algorithms so the CPG and DSI drivers compute PLL parameters consistently and allows the DSI driver to accurately request rates and program its PLL. 
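The relation the new helpers iterate over (documented in the in-code comments below) is Fout = ((M + K / 65536) * (24 MHz / P)) / 2^S. A minimal sketch of the forward computation in millihertz, matching the driver's precision (illustrative only: the function name is made up and the final division truncates instead of rounding to closest as the driver does):

	/* Output frequency in mHz for a given (m, k, p, s) tuple, with a
	 * 24 MHz / p reference; k is the signed fractional part of the
	 * multiplier in 1/65536 steps.
	 */
	static u64 plldsi_fout_millihz(u32 m, s32 k, u32 p, u32 s)
	{
		u64 fref_mhz = div_u64(24ULL * MEGA * MILLI, p);	/* Ffref in mHz */
		s64 frac_mhz = div_s64((s64)k * fref_mhz, 65536);	/* (K/65536) * Ffref */

		/* Fout = (M * Ffref + (K/65536) * Ffref) / 2^S */
		return (u64)(((s64)((u64)m * fref_mhz) + frac_mhz) >> s);
	}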
Co-developed-by: Fabrizio Castro Signed-off-by: Fabrizio Castro Signed-off-by: Lad Prabhakar Acked-by: Tomi Valkeinen Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251015192611.241920-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/clk/renesas/rzv2h-cpg.c | 497 ++++++++++++++++++++++++++++++++++++++++ drivers/clk/renesas/rzv2h-cpg.h | 19 +- include/linux/clk/renesas.h | 145 ++++++++++++ 3 files changed, 659 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/renesas/rzv2h-cpg.c b/drivers/clk/renesas/rzv2h-cpg.c index 6abac15d3475..182437800329 100644 --- a/drivers/clk/renesas/rzv2h-cpg.c +++ b/drivers/clk/renesas/rzv2h-cpg.c @@ -14,9 +14,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include #include @@ -26,6 +31,7 @@ #include #include #include +#include #include @@ -47,7 +53,9 @@ #define CPG_PLL_STBY(x) ((x)) #define CPG_PLL_STBY_RESETB BIT(0) +#define CPG_PLL_STBY_SSC_EN BIT(2) #define CPG_PLL_STBY_RESETB_WEN BIT(16) +#define CPG_PLL_STBY_SSC_EN_WEN BIT(18) #define CPG_PLL_CLK1(x) ((x) + 0x004) #define CPG_PLL_CLK1_KDIV GENMASK(31, 16) #define CPG_PLL_CLK1_MDIV GENMASK(15, 6) @@ -65,6 +73,22 @@ #define CPG_CLKSTATUS0 (0x700) +/* On RZ/G3E SoC we have two DSI PLLs */ +#define MAX_CPG_DSI_PLL 2 + +/** + * struct rzv2h_pll_dsi_info - PLL DSI information, holds the limits and parameters + * + * @pll_dsi_limits: PLL DSI parameters limits + * @pll_dsi_parameters: Calculated PLL DSI parameters + * @req_pll_dsi_rate: Requested PLL DSI rate + */ +struct rzv2h_pll_dsi_info { + const struct rzv2h_pll_limits *pll_dsi_limits; + struct rzv2h_pll_div_pars pll_dsi_parameters; + unsigned long req_pll_dsi_rate; +}; + /** * struct rzv2h_cpg_priv - Clock Pulse Generator Private Data * @@ -80,6 +104,7 @@ * @ff_mod_status_ops: Fixed Factor Module Status Clock operations * @mstop_count: Array of mstop values * @rcdev: Reset controller entity + * @pll_dsi_info: Array of PLL DSI information, holds the limits and parameters */ struct rzv2h_cpg_priv { struct device *dev; @@ -98,6 +123,8 @@ struct rzv2h_cpg_priv { atomic_t *mstop_count; struct reset_controller_dev rcdev; + + struct rzv2h_pll_dsi_info pll_dsi_info[MAX_CPG_DSI_PLL]; }; #define rcdev_to_priv(x) container_of(x, struct rzv2h_cpg_priv, rcdev) @@ -168,6 +195,460 @@ struct rzv2h_ff_mod_status_clk { #define to_rzv2h_ff_mod_status_clk(_hw) \ container_of(_hw, struct rzv2h_ff_mod_status_clk, fix.hw) +/** + * struct rzv2h_plldsi_div_clk - PLL DSI DDIV clock + * + * @dtable: divider table + * @priv: CPG private data + * @hw: divider clk + * @ddiv: divider configuration + */ +struct rzv2h_plldsi_div_clk { + const struct clk_div_table *dtable; + struct rzv2h_cpg_priv *priv; + struct clk_hw hw; + struct ddiv ddiv; +}; + +#define to_plldsi_div_clk(_hw) \ + container_of(_hw, struct rzv2h_plldsi_div_clk, hw) + +#define RZ_V2H_OSC_CLK_IN_MEGA (24 * MEGA) +#define RZV2H_MAX_DIV_TABLES (16) + +/** + * rzv2h_get_pll_pars - Finds the best combination of PLL parameters + * for a given frequency. + * + * @limits: Pointer to the structure containing the limits for the PLL parameters + * @pars: Pointer to the structure where the best calculated PLL parameters values + * will be stored + * @freq_millihz: Target output frequency in millihertz + * + * This function calculates the best set of PLL parameters (M, K, P, S) to achieve + * the desired frequency. 
+ * There is no direct formula to calculate the PLL parameters, as it's an open + * system of equations, therefore this function uses an iterative approach to + * determine the best solution. The best solution is one that minimizes the error + * (desired frequency - actual frequency). + * + * Return: true if a valid set of parameters values is found, false otherwise. + */ +bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, u64 freq_millihz) +{ + u64 fout_min_millihz = mul_u32_u32(limits->fout.min, MILLI); + u64 fout_max_millihz = mul_u32_u32(limits->fout.max, MILLI); + struct rzv2h_pll_pars p, best; + + if (freq_millihz > fout_max_millihz || + freq_millihz < fout_min_millihz) + return false; + + /* Initialize best error to maximum possible value */ + best.error_millihz = S64_MAX; + + for (p.p = limits->p.min; p.p <= limits->p.max; p.p++) { + u32 fref = RZ_V2H_OSC_CLK_IN_MEGA / p.p; + u16 divider; + + for (divider = 1 << limits->s.min, p.s = limits->s.min; + p.s <= limits->s.max; p.s++, divider <<= 1) { + for (p.m = limits->m.min; p.m <= limits->m.max; p.m++) { + u64 output_m, output_k_range; + s64 pll_k, output_k; + u64 fvco, output; + + /* + * The frequency generated by the PLL + divider + * is calculated as follows: + * + * With: + * Freq = Ffout = Ffvco / 2^(pll_s) + * Ffvco = (pll_m + (pll_k / 65536)) * Ffref + * Ffref = 24MHz / pll_p + * + * Freq can also be rewritten as: + * Freq = Ffvco / 2^(pll_s) + * = ((pll_m + (pll_k / 65536)) * Ffref) / 2^(pll_s) + * = (pll_m * Ffref) / 2^(pll_s) + ((pll_k / 65536) * Ffref) / 2^(pll_s) + * = output_m + output_k + * + * Every parameter has been determined at this + * point, but pll_k. + * + * Considering that: + * limits->k.min <= pll_k <= limits->k.max + * Then: + * -0.5 <= (pll_k / 65536) < 0.5 + * Therefore: + * -Ffref / (2 * 2^(pll_s)) <= output_k < Ffref / (2 * 2^(pll_s)) + */ + + /* Compute output M component (in mHz) */ + output_m = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(p.m, fref) * MILLI, + divider); + /* Compute range for output K (in mHz) */ + output_k_range = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(fref, MILLI), + 2 * divider); + /* + * No point in continuing if we can't achieve + * the desired frequency + */ + if (freq_millihz < (output_m - output_k_range) || + freq_millihz >= (output_m + output_k_range)) { + continue; + } + + /* + * Compute the K component + * + * Since: + * Freq = output_m + output_k + * Then: + * output_k = Freq - output_m + * = ((pll_k / 65536) * Ffref) / 2^(pll_s) + * Therefore: + * pll_k = (output_k * 65536 * 2^(pll_s)) / Ffref + */ + output_k = freq_millihz - output_m; + pll_k = div_s64(output_k * 65536ULL * divider, + fref); + pll_k = DIV_S64_ROUND_CLOSEST(pll_k, MILLI); + + /* Validate K value within allowed limits */ + if (pll_k < limits->k.min || + pll_k > limits->k.max) + continue; + + p.k = pll_k; + + /* Compute (Ffvco * 65536) */ + fvco = mul_u32_u32(p.m * 65536 + p.k, fref); + if (fvco < mul_u32_u32(limits->fvco.min, 65536) || + fvco > mul_u32_u32(limits->fvco.max, 65536)) + continue; + + /* PLL_M component of (output * 65536 * PLL_P) */ + output = mul_u32_u32(p.m * 65536, RZ_V2H_OSC_CLK_IN_MEGA); + /* PLL_K component of (output * 65536 * PLL_P) */ + output += p.k * RZ_V2H_OSC_CLK_IN_MEGA; + /* Make it in mHz */ + output *= MILLI; + output = DIV_U64_ROUND_CLOSEST(output, 65536 * p.p * divider); + + /* Check output frequency against limits */ + if (output < fout_min_millihz || + output > fout_max_millihz) + continue; + + p.error_millihz = freq_millihz - output; + 
p.freq_millihz = output; + + /* If an exact match is found, return immediately */ + if (p.error_millihz == 0) { + *pars = p; + return true; + } + + /* Update best match if error is smaller */ + if (abs(best.error_millihz) > abs(p.error_millihz)) + best = p; + } + } + } + + /* If no valid parameters were found, return false */ + if (best.error_millihz == S64_MAX) + return false; + + *pars = best; + return true; +} +EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_pars, "RZV2H_CPG"); + +/* + * rzv2h_get_pll_divs_pars - Finds the best combination of PLL parameters + * and divider value for a given frequency. + * + * @limits: Pointer to the structure containing the limits for the PLL parameters + * @pars: Pointer to the structure where the best calculated PLL parameters and + * divider values will be stored + * @table: Pointer to the array of valid divider values + * @table_size: Size of the divider values array + * @freq_millihz: Target output frequency in millihertz + * + * This function calculates the best set of PLL parameters (M, K, P, S) and divider + * value to achieve the desired frequency. See rzv2h_get_pll_pars() for more details + * on how the PLL parameters are calculated. + * + * freq_millihz is the desired frequency generated by the PLL followed by a + * a gear. + */ +bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, u64 freq_millihz) +{ + struct rzv2h_pll_div_pars p, best; + + best.div.error_millihz = S64_MAX; + p.div.error_millihz = S64_MAX; + for (unsigned int i = 0; i < table_size; i++) { + if (!rzv2h_get_pll_pars(limits, &p.pll, freq_millihz * table[i])) + continue; + + p.div.divider_value = table[i]; + p.div.freq_millihz = DIV_U64_ROUND_CLOSEST(p.pll.freq_millihz, table[i]); + p.div.error_millihz = freq_millihz - p.div.freq_millihz; + + if (p.div.error_millihz == 0) { + *pars = p; + return true; + } + + if (abs(best.div.error_millihz) > abs(p.div.error_millihz)) + best = p; + } + + if (best.div.error_millihz == S64_MAX) + return false; + + *pars = best; + return true; +} +EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_divs_pars, "RZV2H_CPG"); + +static unsigned long rzv2h_cpg_plldsi_div_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + struct ddiv ddiv = dsi_div->ddiv; + u32 div; + + div = readl(priv->base + ddiv.offset); + div >>= ddiv.shift; + div &= clk_div_mask(ddiv.width); + div = dsi_div->dtable[div].div; + + return DIV_ROUND_CLOSEST_ULL(parent_rate, div); +} + +static int rzv2h_cpg_plldsi_div_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw)); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + u8 table[RZV2H_MAX_DIV_TABLES] = { 0 }; + struct rzv2h_pll_div_pars *dsi_params; + struct rzv2h_pll_dsi_info *dsi_info; + const struct clk_div_table *div; + unsigned int i = 0; + u64 rate_millihz; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + dsi_params = &dsi_info->pll_dsi_parameters; + + rate_millihz = mul_u32_u32(req->rate, MILLI); + if (rate_millihz == dsi_params->div.error_millihz + dsi_params->div.freq_millihz) + goto exit_determine_rate; + + for (div = dsi_div->dtable; div->div; div++) { + if (i >= RZV2H_MAX_DIV_TABLES) + return -EINVAL; + table[i++] = div->div; + } + + if (!rzv2h_get_pll_divs_pars(dsi_info->pll_dsi_limits, dsi_params, 
table, i, + rate_millihz)) { + dev_err(priv->dev, "failed to determine rate for req->rate: %lu\n", + req->rate); + return -EINVAL; + } + +exit_determine_rate: + req->rate = DIV_ROUND_CLOSEST_ULL(dsi_params->div.freq_millihz, MILLI); + req->best_parent_rate = req->rate * dsi_params->div.divider_value; + dsi_info->req_pll_dsi_rate = req->best_parent_rate; + + return 0; +} + +static int rzv2h_cpg_plldsi_div_set_rate(struct clk_hw *hw, + unsigned long rate, + unsigned long parent_rate) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw)); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + struct rzv2h_pll_div_pars *dsi_params; + struct rzv2h_pll_dsi_info *dsi_info; + struct ddiv ddiv = dsi_div->ddiv; + const struct clk_div_table *clkt; + bool divider_found = false; + u32 val, shift; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + dsi_params = &dsi_info->pll_dsi_parameters; + + for (clkt = dsi_div->dtable; clkt->div; clkt++) { + if (clkt->div == dsi_params->div.divider_value) { + divider_found = true; + break; + } + } + + if (!divider_found) + return -EINVAL; + + shift = ddiv.shift; + val = readl(priv->base + ddiv.offset) | DDIV_DIVCTL_WEN(shift); + val &= ~(clk_div_mask(ddiv.width) << shift); + val |= clkt->val << shift; + writel(val, priv->base + ddiv.offset); + + return 0; +} + +static const struct clk_ops rzv2h_cpg_plldsi_div_ops = { + .recalc_rate = rzv2h_cpg_plldsi_div_recalc_rate, + .determine_rate = rzv2h_cpg_plldsi_div_determine_rate, + .set_rate = rzv2h_cpg_plldsi_div_set_rate, +}; + +static struct clk * __init +rzv2h_cpg_plldsi_div_clk_register(const struct cpg_core_clk *core, + struct rzv2h_cpg_priv *priv) +{ + struct rzv2h_plldsi_div_clk *clk_hw_data; + struct clk **clks = priv->clks; + struct clk_init_data init; + const struct clk *parent; + const char *parent_name; + struct clk_hw *clk_hw; + int ret; + + parent = clks[core->parent]; + if (IS_ERR(parent)) + return ERR_CAST(parent); + + clk_hw_data = devm_kzalloc(priv->dev, sizeof(*clk_hw_data), GFP_KERNEL); + if (!clk_hw_data) + return ERR_PTR(-ENOMEM); + + clk_hw_data->priv = priv; + clk_hw_data->ddiv = core->cfg.ddiv; + clk_hw_data->dtable = core->dtable; + + parent_name = __clk_get_name(parent); + init.name = core->name; + init.ops = &rzv2h_cpg_plldsi_div_ops; + init.flags = core->flag; + init.parent_names = &parent_name; + init.num_parents = 1; + + clk_hw = &clk_hw_data->hw; + clk_hw->init = &init; + + ret = devm_clk_hw_register(priv->dev, clk_hw); + if (ret) + return ERR_PTR(ret); + + return clk_hw->clk; +} + +static int rzv2h_cpg_plldsi_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct pll_clk *pll_clk = to_pll(hw); + struct rzv2h_cpg_priv *priv = pll_clk->priv; + struct rzv2h_pll_dsi_info *dsi_info; + u64 rate_millihz; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + /* check if the divider has already invoked the algorithm */ + if (req->rate == dsi_info->req_pll_dsi_rate) + return 0; + + /* If the req->rate doesn't match we do the calculation assuming there is no divider */ + rate_millihz = mul_u32_u32(req->rate, MILLI); + if (!rzv2h_get_pll_pars(dsi_info->pll_dsi_limits, + &dsi_info->pll_dsi_parameters.pll, rate_millihz)) { + dev_err(priv->dev, + "failed to determine rate for req->rate: %lu\n", + req->rate); + return -EINVAL; + } + + req->rate = DIV_ROUND_CLOSEST_ULL(dsi_info->pll_dsi_parameters.pll.freq_millihz, MILLI); + dsi_info->req_pll_dsi_rate = req->rate; + + return 0; +} + +static int 
rzv2h_cpg_pll_set_rate(struct pll_clk *pll_clk, + struct rzv2h_pll_pars *params, + bool ssc_disable) +{ + struct rzv2h_cpg_priv *priv = pll_clk->priv; + u16 offset = pll_clk->pll.offset; + u32 val; + int ret; + + /* Put PLL into standby mode */ + writel(CPG_PLL_STBY_RESETB_WEN, priv->base + CPG_PLL_STBY(offset)); + ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset), + val, !(val & CPG_PLL_MON_LOCK), + 100, 2000); + if (ret) { + dev_err(priv->dev, "Failed to put PLLDSI into standby mode"); + return ret; + } + + /* Output clock setting 1 */ + writel(FIELD_PREP(CPG_PLL_CLK1_KDIV, (u16)params->k) | + FIELD_PREP(CPG_PLL_CLK1_MDIV, params->m) | + FIELD_PREP(CPG_PLL_CLK1_PDIV, params->p), + priv->base + CPG_PLL_CLK1(offset)); + + /* Output clock setting 2 */ + val = readl(priv->base + CPG_PLL_CLK2(offset)); + writel((val & ~CPG_PLL_CLK2_SDIV) | FIELD_PREP(CPG_PLL_CLK2_SDIV, params->s), + priv->base + CPG_PLL_CLK2(offset)); + + /* Put PLL to normal mode */ + if (ssc_disable) + val = CPG_PLL_STBY_SSC_EN_WEN; + else + val = CPG_PLL_STBY_SSC_EN_WEN | CPG_PLL_STBY_SSC_EN; + writel(val | CPG_PLL_STBY_RESETB_WEN | CPG_PLL_STBY_RESETB, + priv->base + CPG_PLL_STBY(offset)); + + /* PLL normal mode transition, output clock stability check */ + ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset), + val, (val & CPG_PLL_MON_LOCK), + 100, 2000); + if (ret) { + dev_err(priv->dev, "Failed to put PLLDSI into normal mode"); + return ret; + } + + return 0; +} + +static int rzv2h_cpg_plldsi_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct pll_clk *pll_clk = to_pll(hw); + struct rzv2h_pll_dsi_info *dsi_info; + struct rzv2h_cpg_priv *priv = pll_clk->priv; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + + return rzv2h_cpg_pll_set_rate(pll_clk, &dsi_info->pll_dsi_parameters.pll, true); +} + static int rzv2h_cpg_pll_clk_is_enabled(struct clk_hw *hw) { struct pll_clk *pll_clk = to_pll(hw); @@ -238,6 +719,12 @@ static unsigned long rzv2h_cpg_pll_clk_recalc_rate(struct clk_hw *hw, return DIV_ROUND_CLOSEST_ULL(rate, FIELD_GET(CPG_PLL_CLK1_PDIV, clk1)); } +static const struct clk_ops rzv2h_cpg_plldsi_ops = { + .recalc_rate = rzv2h_cpg_pll_clk_recalc_rate, + .determine_rate = rzv2h_cpg_plldsi_determine_rate, + .set_rate = rzv2h_cpg_plldsi_set_rate, +}; + static const struct clk_ops rzv2h_cpg_pll_ops = { .is_enabled = rzv2h_cpg_pll_clk_is_enabled, .enable = rzv2h_cpg_pll_clk_enable, @@ -264,6 +751,10 @@ rzv2h_cpg_pll_clk_register(const struct cpg_core_clk *core, if (!pll_clk) return ERR_PTR(-ENOMEM); + if (core->type == CLK_TYPE_PLLDSI) + priv->pll_dsi_info[core->cfg.pll.instance].pll_dsi_limits = + core->cfg.pll.limits; + parent_name = __clk_get_name(parent); init.name = core->name; init.ops = ops; @@ -588,6 +1079,12 @@ rzv2h_cpg_register_core_clk(const struct cpg_core_clk *core, case CLK_TYPE_SMUX: clk = rzv2h_cpg_mux_clk_register(core, priv); break; + case CLK_TYPE_PLLDSI: + clk = rzv2h_cpg_pll_clk_register(core, priv, &rzv2h_cpg_plldsi_ops); + break; + case CLK_TYPE_PLLDSI_DIV: + clk = rzv2h_cpg_plldsi_div_clk_register(core, priv); + break; default: goto fail; } diff --git a/drivers/clk/renesas/rzv2h-cpg.h b/drivers/clk/renesas/rzv2h-cpg.h index e2053049c299..637803bc1e89 100644 --- a/drivers/clk/renesas/rzv2h-cpg.h +++ b/drivers/clk/renesas/rzv2h-cpg.h @@ -22,15 +22,20 @@ struct pll { unsigned int offset:9; unsigned int has_clkn:1; unsigned int instance:2; + const struct rzv2h_pll_limits *limits; }; -#define PLL_PACK(_offset, _has_clkn, 
_instance) \ +#define PLL_PACK_LIMITS(_offset, _has_clkn, _instance, _limits) \ ((struct pll){ \ .offset = _offset, \ .has_clkn = _has_clkn, \ - .instance = _instance \ + .instance = _instance, \ + .limits = _limits \ }) +#define PLL_PACK(_offset, _has_clkn, _instance) \ + PLL_PACK_LIMITS(_offset, _has_clkn, _instance, NULL) + #define PLLCA55 PLL_PACK(0x60, 1, 0) #define PLLGPU PLL_PACK(0x120, 1, 0) @@ -191,6 +196,8 @@ enum clk_types { CLK_TYPE_PLL, CLK_TYPE_DDIV, /* Dynamic Switching Divider */ CLK_TYPE_SMUX, /* Static Mux */ + CLK_TYPE_PLLDSI, /* PLLDSI */ + CLK_TYPE_PLLDSI_DIV, /* PLLDSI divider */ }; #define DEF_TYPE(_name, _id, _type...) \ @@ -221,6 +228,14 @@ enum clk_types { .num_parents = ARRAY_SIZE(_parent_names), \ .flag = CLK_SET_RATE_PARENT, \ .mux_flags = CLK_MUX_HIWORD_MASK) +#define DEF_PLLDSI(_name, _id, _parent, _pll_packed) \ + DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI, .parent = _parent, .cfg.pll = _pll_packed) +#define DEF_PLLDSI_DIV(_name, _id, _parent, _ddiv_packed, _dtable) \ + DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI_DIV, \ + .cfg.ddiv = _ddiv_packed, \ + .dtable = _dtable, \ + .parent = _parent, \ + .flag = CLK_SET_RATE_PARENT) /** * struct rzv2h_mod_clk - Module Clocks definitions diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h index 0ebbe2f0b45e..69d8159deee3 100644 --- a/include/linux/clk/renesas.h +++ b/include/linux/clk/renesas.h @@ -10,7 +10,9 @@ #ifndef __LINUX_CLK_RENESAS_H_ #define __LINUX_CLK_RENESAS_H_ +#include #include +#include struct device; struct device_node; @@ -32,4 +34,147 @@ void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev); #define cpg_mssr_attach_dev NULL #define cpg_mssr_detach_dev NULL #endif + +/** + * struct rzv2h_pll_limits - PLL parameter constraints + * + * This structure defines the minimum and maximum allowed values for + * various parameters used to configure a PLL. These limits ensure + * the PLL operates within valid and stable ranges. + * + * @fout: Output frequency range (in MHz) + * @fout.min: Minimum allowed output frequency + * @fout.max: Maximum allowed output frequency + * + * @fvco: PLL oscillation frequency range (in MHz) + * @fvco.min: Minimum allowed VCO frequency + * @fvco.max: Maximum allowed VCO frequency + * + * @m: Main-divider range + * @m.min: Minimum main-divider value + * @m.max: Maximum main-divider value + * + * @p: Pre-divider range + * @p.min: Minimum pre-divider value + * @p.max: Maximum pre-divider value + * + * @s: Divider range + * @s.min: Minimum divider value + * @s.max: Maximum divider value + * + * @k: Delta-sigma modulator range (signed) + * @k.min: Minimum delta-sigma value + * @k.max: Maximum delta-sigma value + */ +struct rzv2h_pll_limits { + struct { + u32 min; + u32 max; + } fout; + + struct { + u32 min; + u32 max; + } fvco; + + struct { + u16 min; + u16 max; + } m; + + struct { + u8 min; + u8 max; + } p; + + struct { + u8 min; + u8 max; + } s; + + struct { + s16 min; + s16 max; + } k; +}; + +/** + * struct rzv2h_pll_pars - PLL configuration parameters + * + * This structure contains the configuration parameters for the + * Phase-Locked Loop (PLL), used to achieve a specific output frequency. 
+ * + * @m: Main divider value + * @p: Pre-divider value + * @s: Output divider value + * @k: Delta-sigma modulation value + * @freq_millihz: Calculated PLL output frequency in millihertz + * @error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_pars { + u16 m; + u8 p; + u8 s; + s16 k; + u64 freq_millihz; + s64 error_millihz; +}; + +/** + * struct rzv2h_pll_div_pars - PLL parameters with post-divider + * + * This structure is used for PLLs that include an additional post-divider + * stage after the main PLL block. It contains both the PLL configuration + * parameters and the resulting frequency/error values after the divider. + * + * @pll: Main PLL configuration parameters (see struct rzv2h_pll_pars) + * + * @div: Post-divider configuration and result + * @div.divider_value: Divider applied to the PLL output + * @div.freq_millihz: Output frequency after divider in millihertz + * @div.error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_div_pars { + struct rzv2h_pll_pars pll; + struct { + u8 divider_value; + u64 freq_millihz; + s64 error_millihz; + } div; +}; + +#define RZV2H_CPG_PLL_DSI_LIMITS(name) \ + static const struct rzv2h_pll_limits (name) = { \ + .fout = { .min = 25 * MEGA, .max = 375 * MEGA }, \ + .fvco = { .min = 1600 * MEGA, .max = 3200 * MEGA }, \ + .m = { .min = 64, .max = 533 }, \ + .p = { .min = 1, .max = 4 }, \ + .s = { .min = 0, .max = 6 }, \ + .k = { .min = -32768, .max = 32767 }, \ + } \ + +#ifdef CONFIG_CLK_RZV2H +bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, u64 freq_millihz); + +bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, u64 freq_millihz); +#else +static inline bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, + u64 freq_millihz) +{ + return false; +} + +static inline bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, + u64 freq_millihz) +{ + return false; +} +#endif + + #endif -- cgit v1.2.3 From fd714986e4e46effa6697b13d32918fc59608ccb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Oct 2025 19:21:09 -0700 Subject: iommu: Pass in old domain to attach_dev callback functions The IOMMU core attaches each device to a default domain on probe(). Then, every new "attach" operation is fundamentally two-fold: - detach from its currently attached (old) domain - attach to a given new domain Modern IOMMU drivers following this pattern usually want to clean up state related to the old domain, so they call iommu_get_domain_for_dev() to fetch the old domain. Pass in the old domain pointer from the core to drivers, aligning with the set_dev_pasid op that does so already. Ensure all low-level attach functions in the core can forward the correct old domain pointer. Thus, rework those functions as well.
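To illustrate the driver-side effect of the new parameter (this sketch is not taken from the patch; the example_* names and the teardown helper are hypothetical), an identity-attach callback can now consume the old domain handed in by the core instead of calling iommu_get_domain_for_dev():

#include <linux/container_of.h>
#include <linux/device.h>
#include <linux/iommu.h>

/* made-up driver-private domain type, for illustration only */
struct example_domain {
	struct iommu_domain domain;
};

static struct example_domain *to_example_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct example_domain, domain);
}

/* placeholder for hardware-specific teardown */
static void example_disable_translation(struct device *dev,
					struct example_domain *prev)
{
}

static int example_identity_attach(struct iommu_domain *identity_domain,
				   struct device *dev,
				   struct iommu_domain *old)
{
	/*
	 * The core now passes the previously attached domain directly,
	 * so no iommu_get_domain_for_dev() lookup is needed here.
	 */
	if (old == identity_domain || !old)
		return 0;

	example_disable_translation(dev, to_example_domain(old));
	return 0;
}

This mirrors the pattern applied to the converted drivers in the diff below (qcom, ipmmu, msm, mtk, omap, tegra and others).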
Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- arch/powerpc/kernel/iommu.c | 5 +++-- drivers/iommu/amd/iommu.c | 11 +++++----- drivers/iommu/apple-dart.c | 9 +++++--- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 5 +++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 +++++++++++++-------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 9 +++++--- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 11 +++++----- drivers/iommu/exynos-iommu.c | 8 ++++--- drivers/iommu/fsl_pamu_domain.c | 12 +++++------ drivers/iommu/intel/iommu.c | 10 ++++++--- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/iommu.c | 25 +++++++++++++--------- drivers/iommu/iommufd/selftest.c | 2 +- drivers/iommu/ipmmu-vmsa.c | 10 ++++----- drivers/iommu/msm_iommu.c | 11 +++++----- drivers/iommu/mtk_iommu.c | 8 +++---- drivers/iommu/mtk_iommu_v1.c | 7 ++++-- drivers/iommu/omap-iommu.c | 12 +++++------ drivers/iommu/riscv/iommu.c | 9 +++++--- drivers/iommu/rockchip-iommu.c | 20 ++++++++++++----- drivers/iommu/s390-iommu.c | 13 ++++++----- drivers/iommu/sprd-iommu.c | 3 ++- drivers/iommu/sun50i-iommu.c | 8 ++++--- drivers/iommu/tegra-smmu.c | 10 ++++----- drivers/iommu/virtio-iommu.c | 6 ++++-- include/linux/iommu.h | 3 ++- 26 files changed, 153 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 244eb4857e7f..b7dcf07b2499 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1156,7 +1156,8 @@ EXPORT_SYMBOL_GPL(iommu_add_device); */ static int spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_table_group *table_group; @@ -1189,7 +1190,7 @@ static struct iommu_domain spapr_tce_platform_domain = { static int spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6f4559eb5121..e16ad510c8c8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -70,8 +70,8 @@ int amd_iommu_max_glx_val = -1; */ DEFINE_IDA(pdom_ids); -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data); @@ -2635,7 +2635,8 @@ void amd_iommu_domain_free(struct iommu_domain *dom) } static int blocked_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2685,8 +2686,8 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain); } -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 95a4e62b8f63..b5848770ef48 100644 --- 
a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, } static int apple_dart_attach_dev_paging(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret, i; struct apple_dart_stream_map *stream_map; @@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain, } static int apple_dart_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = { }; static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 8cd8929bbfdf..313201a61699 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -138,14 +138,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) } static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_nested_domain *nested_domain = to_smmu_nested_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_ste ste; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2125ebfc9a70..a33fbd12a0dd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) master->ats_enabled = state->ats_enabled; } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old_domain) { int ret = 0; struct arm_smmu_ste target; @@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_attach_state state = { - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_master *master; @@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, /* * When the last user of the CD table goes away downgrade the STE back - * to a non-cd_table one. + * to a non-cd_table one, by re-attaching its sid_domain. 
*/ if (!arm_smmu_ssids_in_use(&master->cd_table)) { struct iommu_domain *sid_domain = @@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || sid_domain->type == IOMMU_DOMAIN_BLOCKED) - sid_domain->ops->attach_dev(sid_domain, dev); + sid_domain->ops->attach_dev(sid_domain, dev, + sid_domain); } return 0; } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct iommu_domain *old_domain, struct device *dev, struct arm_smmu_ste *ste, unsigned int s1dss) @@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; @@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_BYPASS); return 0; } @@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); return 0; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4ced4b5bee4d..5e690cf85ec9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, } } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); } @@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index c5be95e56031..9222a4a48bb3 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) kfree(qcom_domain); } -static int qcom_iommu_attach_dev(struct 
iommu_domain *domain, struct device *dev) +static int qcom_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); @@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev } static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - qcom_domain = to_qcom_iommu_domain(domain); + qcom_domain = to_qcom_iommu_domain(old); if (WARN_ON(!qcom_domain->iommu)) return -EINVAL; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 0857519ca718..e375ced6e2b0 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) } static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct exynos_iommu_domain *domain; @@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = { }; static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); @@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, unsigned long flags; int err; - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); if (err) return err; diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 5f08523f97cb..9664ef9840d2 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val) } static int fsl_pamu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); unsigned long flags; @@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, * switches to what looks like BLOCKING. */ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct fsl_dma_domain *dma_domain; const u32 *prop; int len; @@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, * Hack to keep things working as they always have, only leaving an * UNMANAGED domain makes it BLOCKING. 
*/ - if (domain == platform_domain || !domain || - domain->type != IOMMU_DOMAIN_UNMANAGED) + if (old == platform_domain || !old || + old->type != IOMMU_DOMAIN_UNMANAGED) return 0; - dma_domain = to_fsl_dma_domain(domain); + dma_domain = to_fsl_dma_domain(old); /* * Use LIODN of the PCI controller while detaching a diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..f0396591cd9b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3230,7 +3230,8 @@ void device_block_translation(struct device *dev) } static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -3537,7 +3538,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) } static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret; @@ -4401,7 +4403,9 @@ static int device_setup_pass_through(struct device *dev) context_setup_pass_through_cb, dev); } -static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 1b6ad9c900a5..760d7aa2ade8 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ce141f095f96..2ca990dfbb88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); + struct device *dev, struct iommu_domain *old); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, @@ -114,6 +114,7 @@ enum { static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, @@ -554,7 +555,8 @@ static void iommu_deinit_device(struct device *dev) release_domain == ops->blocked_domain) release_domain = ops->identity_domain; - release_domain->ops->attach_dev(release_domain, dev); + release_domain->ops->attach_dev(release_domain, dev, + group->domain); } if (ops->release_device) @@ -640,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { - ret = __iommu_device_set_domain(group, dev, group->domain, 0); + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, + 0); if (ret) goto err_remove_gdev; } else if 
(!group->default_domain && !group_list) { @@ -2127,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group) } static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - ret = domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; dev->iommu->attach_deferred = 0; @@ -2183,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev); + return __iommu_attach_device(domain, dev, NULL); return 0; } @@ -2296,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags) { int ret; @@ -2321,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group, dev->iommu->attach_deferred = 0; } - ret = __iommu_attach_device(new_domain, dev); + ret = __iommu_attach_device(new_domain, dev, old_domain); if (ret) { /* * If we have a blocking domain then try to attach that in hopes @@ -2331,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group, if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) - __iommu_attach_device(group->blocking_domain, dev); + __iommu_attach_device(group->blocking_domain, dev, + old_domain); return ret; } return 0; @@ -2378,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, - flags); + group->domain, flags); if (ret) { result = ret; /* @@ -2413,7 +2418,7 @@ err_revert: */ if (group->domain) WARN_ON(__iommu_device_set_domain( - group, gdev->dev, group->domain, + group, gdev->dev, group->domain, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); } return ret; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index de178827a078..5661d2da2b67 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -216,7 +216,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) } static int mock_domain_nop_attach(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mock_dev *mdev = to_mock_dev(dev); struct mock_viommu *new_viommu = NULL; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ffa892f65714..6667ecc331f0 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) } static int ipmmu_attach_device(struct iommu_domain *io_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); @@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, } static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct 
ipmmu_vmsa_domain *domain; unsigned int i; - if (io_domain == identity_domain || !io_domain) + if (old == identity_domain || !old) return 0; - domain = to_vmsa_domain(io_domain); + domain = to_vmsa_domain(old); for (i = 0; i < fwspec->num_ids; ++i) ipmmu_utlb_disable(domain, fwspec->ids[i]); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 43a61ba021a5..819add75a665 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; unsigned long flags; @@ -441,19 +442,19 @@ fail: } static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; int ret = 0; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - priv = to_msm_priv(domain); + priv = to_msm_priv(old); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 0e0285348d2b..9747ef164413 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -705,7 +705,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain) } static int mtk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -773,12 +773,12 @@ err_unlock: } static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; mtk_iommu_config(data, dev, false, 0); diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 10cc0b1197e8..3b45650263ac 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain) kfree(to_mtk_domain(domain)); } -static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) +static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); @@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device } static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 5c6f5943f44b..9f0057ccea57 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain) 
odomain->iommus = NULL; } -static int -omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int omap_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); struct omap_iommu_domain *omap_domain = to_omap_domain(domain); @@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, } static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct omap_iommu_domain *omap_domain; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - omap_domain = to_omap_domain(domain); + omap_domain = to_omap_domain(old); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index ebb22979075d..d9429097a2b5 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m } static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); struct riscv_iommu_device *iommu = dev_to_iommu(dev); @@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) } static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); @@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = { }; static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 0861dd469bd8..85f3667e797c 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -960,7 +960,8 @@ out_disable_clocks: } static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain; @@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = { }; static int rk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); @@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - ret = rk_iommu_identity_attach(&rk_identity_domain, dev); + ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old); if (ret) return ret; @@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return 0; ret = rk_iommu_enable(iommu); - if (ret) - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); + if (ret) { + /* + * Note rk_iommu_identity_attach() might fail before physically + * 
attaching the dev to iommu->domain, in which case the actual + * old domain for this revert should be rk_identity_domain v.s. + * iommu->domain. Since rk_iommu_identity_attach() does not care + * about the old domain argument for now, this is not a problem. + */ + WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev, + iommu->domain)); + } pm_runtime_put(iommu->dev); diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index aa576736d60b..fe679850af28 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) } static int blocking_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain *s390_domain; @@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain, } static int s390_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); @@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); @@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void) subsys_initcall(s390_iommu_init); static int s390_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); u8 status; int cc; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index c7ca1d8a0b15..555d4505c747 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) } static int sprd_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); struct sprd_iommu_domain *dom = to_sprd_domain(domain); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index de10b569d9a9..d3b190be18b5 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, } static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); struct sun50i_iommu_domain *sun50i_domain; @@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = { }; static int sun50i_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu; @@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if 
(iommu->domain == domain) return 0; - sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old); sun50i_iommu_attach_domain(iommu, sun50i_domain); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 36cdd5fbab07..336e0a3ff41f 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu, } static int tegra_smmu_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev_iommu_priv_get(dev); @@ -524,9 +524,9 @@ disable: } static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu_as *as; struct tegra_smmu *smmu; @@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, if (!fwspec) return -ENODEV; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - as = to_smmu_as(domain); + as = to_smmu_as(old); smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b39d6f134ab2..d314fa5cd847 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) return domain; } -static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; @@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_attach_identity_domain(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..801b2bd9e8d4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -751,7 +751,8 @@ struct iommu_ops { * @free: Release the domain after use. */ struct iommu_domain_ops { - int (*attach_dev)(struct iommu_domain *domain, struct device *dev); + int (*attach_dev)(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old); -- cgit v1.2.3 From 483768846d66c04354898f00bcdaad58a3763be2 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:28 -0400 Subject: PCI: endpoint: Rename 'epf_bar::aligned_size' to 'epf_bar:mem_size' Rename the member 'epf_bar::aligned_size' to 'epf_bar::mem_size' to better reflect its purpose. 'aligned_size' was misleading, as it actually represents the backing memory size allocated for the BAR rather than the aligned size. 
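To make the renamed field's meaning concrete (a minimal sketch, not part of this patch; the 1 KiB request and the 4 KiB alignment are assumed values, and example_setup_bar0() is a hypothetical helper), the BAR size and the backing allocation size can legitimately differ:

#include <linux/errno.h>
#include <linux/pci-epc.h>
#include <linux/pci-epf.h>
#include <linux/sizes.h>

static int example_setup_bar0(struct pci_epf *epf,
			      const struct pci_epc_features *epc_features)
{
	void *addr;

	addr = pci_epf_alloc_space(epf, SZ_1K, BAR_0, epc_features,
				   PRIMARY_INTERFACE);
	if (!addr)
		return -ENOMEM;

	/*
	 * Assuming epc_features->align == SZ_4K: the BAR itself is sized
	 * from 'size' (1 KiB here), while the coherent buffer behind it is
	 * 'mem_size' (4 KiB) -- the value used when the space is freed.
	 */
	dev_dbg(&epf->dev, "size=%zu mem_size=%zu\n",
		epf->bar[BAR_0].size, epf->bar[BAR_0].mem_size);

	pci_epf_free_space(epf, addr, BAR_0, PRIMARY_INTERFACE);
	return 0;
}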
Signed-off-by: Frank Li Signed-off-by: Manivannan Sadhasivam Reviewed-by: Niklas Cassel Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-1-9230298b1910@nxp.com --- drivers/pci/endpoint/pci-epf-core.c | 12 ++++++------ include/linux/pci-epf.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index d54e18872aef..214e3f6e6d0d 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -236,13 +236,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, } dev = epc->dev.parent; - dma_free_coherent(dev, epf_bar[bar].aligned_size, addr, + dma_free_coherent(dev, epf_bar[bar].mem_size, addr, epf_bar[bar].phys_addr); epf_bar[bar].phys_addr = 0; epf_bar[bar].addr = NULL; epf_bar[bar].size = 0; - epf_bar[bar].aligned_size = 0; + epf_bar[bar].mem_size = 0; epf_bar[bar].barno = 0; epf_bar[bar].flags = 0; } @@ -265,7 +265,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, enum pci_epc_interface_type type) { u64 bar_fixed_size = epc_features->bar[bar].fixed_size; - size_t aligned_size, align = epc_features->align; + size_t mem_size, align = epc_features->align; struct pci_epf_bar *epf_bar; dma_addr_t phys_addr; struct pci_epc *epc; @@ -297,7 +297,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, * it might be different if, for example, the fixed size of a BAR * is smaller than align. */ - aligned_size = align ? ALIGN(size, align) : size; + mem_size = align ? ALIGN(size, align) : size; if (type == PRIMARY_INTERFACE) { epc = epf->epc; @@ -308,7 +308,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } dev = epc->dev.parent; - space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL); + space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL); if (!space) { dev_err(dev, "failed to allocate mem space\n"); return NULL; @@ -317,7 +317,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, epf_bar[bar].phys_addr = phys_addr; epf_bar[bar].addr = space; epf_bar[bar].size = size; - epf_bar[bar].aligned_size = aligned_size; + epf_bar[bar].mem_size = mem_size; epf_bar[bar].barno = bar; if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2e85504ba2ba..4022dd080e20 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -115,8 +115,8 @@ struct pci_epf_driver { * @phys_addr: physical address that should be mapped to the BAR * @addr: virtual address corresponding to the @phys_addr * @size: the size of the address space present in BAR - * @aligned_size: the size actually allocated to accommodate the iATU alignment - * requirement + * @mem_size: the size actually allocated to accommodate the iATU alignment + * requirement * @barno: BAR number * @flags: flags that are set for the BAR */ @@ -124,7 +124,7 @@ struct pci_epf_bar { dma_addr_t phys_addr; void *addr; size_t size; - size_t aligned_size; + size_t mem_size; enum pci_barno barno; int flags; }; -- cgit v1.2.3 From 83be4bee57f0374ff751aaff3fef4af0af66ec81 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 17 Oct 2025 13:26:28 +0000 Subject: ACPI: PRM: Add acpi_prm_handler_available() Add a helper function to check if a PRM handler/module is present. 
This can be used during init time by code that depends on a particular handler. If the handler is not present, then the code does not need to be loaded. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: "Mario Limonciello (AMD)" Acked-by: "Rafael J. Wysocki (Intel)" Link: https://patch.msgid.link/all/20251017-wip-atl-prm-v2-1-7ab1df4a5fbc@amd.com --- drivers/acpi/prmt.c | 6 ++++++ include/linux/prmt.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 6792d4385eee..7b8b5d2015ec 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -244,6 +244,12 @@ static struct prm_handler_info *find_prm_handler(const guid_t *guid) return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER); } +bool acpi_prm_handler_available(const guid_t *guid) +{ + return find_prm_handler(guid) && find_prm_module(guid); +} +EXPORT_SYMBOL_GPL(acpi_prm_handler_available); + /* In-coming PRM commands */ #define PRM_CMD_RUN_SERVICE 0 diff --git a/include/linux/prmt.h b/include/linux/prmt.h index c53ab287e932..8cdc987de963 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -4,9 +4,11 @@ #ifdef CONFIG_ACPI_PRMT void init_prmt(void); +bool acpi_prm_handler_available(const guid_t *handler_guid); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); #else static inline void init_prmt(void) { } +static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; } static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer) { return -EOPNOTSUPP; -- cgit v1.2.3 From 0bfc6758f213a701bd662982de86f0032b51f18c Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:30 -0400 Subject: PCI: endpoint: Add pci_epf_assign_bar_space() API Add pci_epf_assign_bar_space() API to allow setting any MMIO address as the BAR memory space, such as an MSI message base address. This API also conforms to the BAR base address and size alignment restrictions enforced by the PCI spec r6.0, sec 7.5.1.2.1. Signed-off-by: Frank Li [mani: removed unused epc var, reworded kdoc, comments and description] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-3-9230298b1910@nxp.com --- drivers/pci/endpoint/pci-epf-core.c | 77 +++++++++++++++++++++++++++++++++++++ include/linux/pci-epf.h | 6 +++ 2 files changed, 83 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index ec20aabb7f75..9a505c796370 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -346,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } EXPORT_SYMBOL_GPL(pci_epf_alloc_space); +/** + * pci_epf_assign_bar_space() - Assign PCI EPF BAR space + * @epf: EPF device to assign the BAR memory + * @size: Size of the memory that has to be assigned + * @bar: BAR number for which the memory is assigned + * @epc_features: Features provided by the EPC specific to this EPF + * @type: Identifies if the assignment is for primary EPC or secondary EPC + * @bar_addr: Address to be assigned for the @bar + * + * Invoke to assign memory for the PCI EPF BAR. + * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR + * can only be a 64-bit BAR, or if the requested size is larger than 2 GB. 
+ */ +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr) +{ + size_t bar_size, aligned_mem_size; + struct pci_epf_bar *epf_bar; + dma_addr_t limit; + int pos; + + if (!size) + return -EINVAL; + + limit = bar_addr + size - 1; + + /* + * Bits: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + * bar_addr: U U U U U U 0 X X X X X X X X X + * limit: U U U U U U 1 X X X X X X X X X + * + * bar_addr^limit 0 0 0 0 0 0 1 X X X X X X X X X + * + * U: unchanged address bits in range [bar_addr, limit] + * X: bit 0 or 1 + * + * (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB + * (pos). And value of (2 ^ pos) should be able to cover the BAR range. + */ + for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--) + if ((limit ^ bar_addr) & BIT_ULL(pos)) + break; + + if (pos == 8 * sizeof(dma_addr_t) - 1) + return -EINVAL; + + bar_size = BIT_ULL(pos + 1); + if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size, + bar, epc_features, type)) + return -ENOMEM; + + if (type == PRIMARY_INTERFACE) + epf_bar = epf->bar; + else + epf_bar = epf->sec_epc_bar; + + epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size); + + if (epf_bar[bar].phys_addr + bar_size < limit) + return -ENOMEM; + + epf_bar[bar].addr = NULL; + epf_bar[bar].size = bar_size; + epf_bar[bar].mem_size = aligned_mem_size; + epf_bar[bar].barno = bar; + if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; + else + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space); + static void pci_epf_remove_cfs(struct pci_epf_driver *driver) { struct config_group *group, *tmp; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 4022dd080e20..48f68c4dcfa5 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -242,6 +242,12 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, enum pci_epc_interface_type type); +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr); + int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar, u64 addr, dma_addr_t *base, size_t *off); int pci_epf_bind(struct pci_epf *epf); -- cgit v1.2.3 From 013a3a66f25af3fb614f45df43983657514944c4 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:54:55 +0100 Subject: regmap: sdw-mbq: Don't assume the regmap device is the SoundWire slave Currently, the code assumes that the device that registered the MBQ register map is the actual SoundWire slave device. This works fine for all current users, however future SDCA devices will likely be implemented with the SoundWire slave as a parent device and separate child drivers with regmaps for each audio Function. Update the regmap_init_sdw_mbq_cfg macro to allow these two to be specified separately. 
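For illustration (an editorial sketch, not from the patch): a hypothetical SDCA function child driver, whose parent device is the SoundWire slave, could now pass the two devices separately; 'func_dev', 'func_regmap_cfg' and 'func_mbq_cfg' are made-up names.

    /* Sketch: child function device with the SoundWire slave as parent. */
    struct sdw_slave *slave = dev_to_sdw_dev(func_dev->parent);
    struct regmap *regmap;

    regmap = devm_regmap_init_sdw_mbq_cfg(func_dev, slave,
                                          &func_regmap_cfg, &func_mbq_cfg);
    if (IS_ERR(regmap))
        return PTR_ERR(regmap);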
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-sdw-mbq.c | 23 ++++++++++++----------- include/linux/regmap.h | 21 +++++++++++---------- sound/soc/codecs/rt722-sdca-sdw.c | 4 +++- 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap-sdw-mbq.c b/drivers/base/regmap/regmap-sdw-mbq.c index 86644bbd0710..8b7d34a6080d 100644 --- a/drivers/base/regmap/regmap-sdw-mbq.c +++ b/drivers/base/regmap/regmap-sdw-mbq.c @@ -15,6 +15,7 @@ struct regmap_mbq_context { struct device *dev; + struct sdw_slave *sdw; struct regmap_sdw_mbq_cfg cfg; @@ -46,7 +47,7 @@ static bool regmap_sdw_mbq_deferrable(struct regmap_mbq_context *ctx, unsigned i static int regmap_sdw_mbq_poll_busy(struct sdw_slave *slave, unsigned int reg, struct regmap_mbq_context *ctx) { - struct device *dev = &slave->dev; + struct device *dev = ctx->dev; int val, ret = 0; dev_dbg(dev, "Deferring transaction for 0x%x\n", reg); @@ -96,8 +97,7 @@ static int regmap_sdw_mbq_write_impl(struct sdw_slave *slave, static int regmap_sdw_mbq_write(void *context, unsigned int reg, unsigned int val) { struct regmap_mbq_context *ctx = context; - struct device *dev = ctx->dev; - struct sdw_slave *slave = dev_to_sdw_dev(dev); + struct sdw_slave *slave = ctx->sdw; bool deferrable = regmap_sdw_mbq_deferrable(ctx, reg); int mbq_size = regmap_sdw_mbq_size(ctx, reg); int ret; @@ -156,8 +156,7 @@ static int regmap_sdw_mbq_read_impl(struct sdw_slave *slave, static int regmap_sdw_mbq_read(void *context, unsigned int reg, unsigned int *val) { struct regmap_mbq_context *ctx = context; - struct device *dev = ctx->dev; - struct sdw_slave *slave = dev_to_sdw_dev(dev); + struct sdw_slave *slave = ctx->sdw; bool deferrable = regmap_sdw_mbq_deferrable(ctx, reg); int mbq_size = regmap_sdw_mbq_size(ctx, reg); int ret; @@ -208,6 +207,7 @@ static int regmap_sdw_mbq_config_check(const struct regmap_config *config) static struct regmap_mbq_context * regmap_sdw_mbq_gen_context(struct device *dev, + struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config) { @@ -218,6 +218,7 @@ regmap_sdw_mbq_gen_context(struct device *dev, return ERR_PTR(-ENOMEM); ctx->dev = dev; + ctx->sdw = sdw; if (mbq_config) ctx->cfg = *mbq_config; @@ -228,7 +229,7 @@ regmap_sdw_mbq_gen_context(struct device *dev, return ctx; } -struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -241,16 +242,16 @@ struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, if (ret) return ERR_PTR(ret); - ctx = regmap_sdw_mbq_gen_context(&sdw->dev, config, mbq_config); + ctx = regmap_sdw_mbq_gen_context(dev, sdw, config, mbq_config); if (IS_ERR(ctx)) return ERR_CAST(ctx); - return __regmap_init(&sdw->dev, ®map_sdw_mbq, ctx, + return __regmap_init(dev, ®map_sdw_mbq, ctx, config, lock_key, lock_name); } EXPORT_SYMBOL_GPL(__regmap_init_sdw_mbq); -struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__devm_regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -263,11 +264,11 @@ struct regmap 
*__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, if (ret) return ERR_PTR(ret); - ctx = regmap_sdw_mbq_gen_context(&sdw->dev, config, mbq_config); + ctx = regmap_sdw_mbq_gen_context(dev, sdw, config, mbq_config); if (IS_ERR(ctx)) return ERR_CAST(ctx); - return __devm_regmap_init(&sdw->dev, ®map_sdw_mbq, ctx, + return __devm_regmap_init(dev, ®map_sdw_mbq, ctx, config, lock_key, lock_name); } EXPORT_SYMBOL_GPL(__devm_regmap_init_sdw_mbq); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4e1ac1fbcec4..70daec535976 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -676,7 +676,7 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); -struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -738,7 +738,7 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); -struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__devm_regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -970,7 +970,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config, NULL) + &sdw->dev, sdw, config, NULL) /** * regmap_init_sdw_mbq_cfg() - Initialise MBQ SDW register map with config @@ -983,9 +983,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); * to a struct regmap. The regmap will be automatically freed by the * device management code. */ -#define regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ +#define regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config, mbq_config) + dev, sdw, config, mbq_config) /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave @@ -1198,12 +1198,13 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define devm_regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config, \ - sdw, config, NULL) + &sdw->dev, sdw, config, NULL) /** * devm_regmap_init_sdw_mbq_cfg() - Initialise managed MBQ SDW register map with config * - * @sdw: Device that will be interacted with + * @dev: Device that will be interacted with + * @sdw: SoundWire Device that will be interacted with * @config: Configuration for register map * @mbq_config: Properties for the MBQ registers * @@ -1211,9 +1212,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); * to a struct regmap. The regmap will be automatically freed by the * device management code. 
*/ -#define devm_regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ - __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ - #config, sdw, config, mbq_config) +#define devm_regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ + #config, dev, sdw, config, mbq_config) /** * devm_regmap_init_slimbus() - Initialise managed register map diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index 5ea40c1b159a..a0f5601a262a 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -419,7 +419,9 @@ static int rt722_sdca_sdw_probe(struct sdw_slave *slave, struct regmap *regmap; /* Regmap Initialization */ - regmap = devm_regmap_init_sdw_mbq_cfg(slave, &rt722_sdca_regmap, &rt722_mbq_config); + regmap = devm_regmap_init_sdw_mbq_cfg(&slave->dev, slave, + &rt722_sdca_regmap, + &rt722_mbq_config); if (IS_ERR(regmap)) return PTR_ERR(regmap); -- cgit v1.2.3 From 87b0031f7f73dac2ebb874fc8f331a66ee3b5cbd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:18 +0100 Subject: irqdomain: Add firmware info reporting interface Add an irqdomain callback to report firmware-provided information that is otherwise not available in a generic way. This is reported using a new data structure (struct irq_fwspec_info). This callback is optional and the only information that can be reported currently is the affinity of an interrupt. However, the containing structure is designed to be extensible, allowing other potentially relevant information to be reported in the future. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-2-maz@kernel.org --- include/linux/irqdomain.h | 27 +++++++++++++++++++++++++++ kernel/irq/irqdomain.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4a86e6b915dd..9d6a5e99394f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -44,6 +44,23 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/** + * struct irq_fwspec_info - firmware provided IRQ information structure + * + * @flags: Information validity flags + * @cpumask: Affinity mask for this interrupt + * + * This structure reports firmware-specific information about an + * interrupt. The only significant information is the affinity of a + * per-CPU interrupt, but this is designed to be extended as required. + */ +struct irq_fwspec_info { + unsigned long flags; + const struct cpumask *affinity; +}; + +#define IRQ_FWSPEC_INFO_AFFINITY_VALID BIT(0) + /* Conversion function from of_phandle_args fields to fwspec */ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec); @@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and * linux irq type value (@out_type). This is a generalised @xlate * (over struct irq_fwspec) and is preferred if provided. + * @get_fwspec_info: + * Given @fwspec, report additional firmware-provided information in + * @info. Optional. * @debug_show: For domains to show specific data for an interrupt in debugfs. 
* * Functions below are provided by the driver and called whenever a new mapping @@ -96,6 +116,7 @@ struct irq_domain_ops { void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); + int (*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void (*debug_show)(struct seq_file *m, struct irq_domain *d, @@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; @@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) return false; } +static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + return -EINVAL; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_MSI_IRQ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc473faadcc8..2652c4cfd877 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); -unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; - struct irq_data *irq_data; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); @@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) domain = irq_default_domain; } + return domain; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + struct irq_domain *domain = fwspec_to_domain(fwspec); + + memset(info, 0, sizeof(*info)); + + if (!domain || !domain->ops->get_fwspec_info) + return 0; + + return domain->ops->get_fwspec_info(fwspec, info); +} +#endif + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +{ + unsigned int type = IRQ_TYPE_NONE; + struct irq_domain *domain; + struct irq_data *irq_data; + irq_hw_number_t hwirq; + int virq; + + domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); -- cgit v1.2.3 From 5324fe21ba9b77b299c02191645a97777cdd73ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:19 +0100 Subject: ACPI: irq: Add interrupt affinity reporting interface Plug the irq_populate_fwspec_info() helper into the ACPI layer to offer an interrupt affinity reporting function. This is currently only supported for the CONFIG_ACPI_GENERIC_GSI configurations, but could later be extended to legacy architectures if necessary. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Acked-by: Rafael J. 
Wysocki (Intel) Link: https://patch.msgid.link/20251020122944.3074811-3-maz@kernel.org --- drivers/acpi/irq.c | 19 +++++++++++++++++++ include/linux/acpi.h | 7 +++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c index 76a856c32c4d..d1595156c86a 100644 --- a/drivers/acpi/irq.c +++ b/drivers/acpi/irq.c @@ -300,6 +300,25 @@ int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) } EXPORT_SYMBOL_GPL(acpi_irq_get); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + struct irq_fwspec_info info; + struct irq_fwspec fwspec; + unsigned long flags; + + if (acpi_irq_parse_one(handle, index, &fwspec, &flags)) + return NULL; + + if (irq_populate_fwspec_info(&fwspec, &info)) + return NULL; + + if (!(info.flags & IRQ_FWSPEC_INFO_AFFINITY_VALID)) + return NULL; + + return info.affinity; +} + /** * acpi_set_irq_model - Setup the GSI irqdomain information * @model: the value assigned to acpi_irq_model diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..607db773b672 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1509,12 +1509,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } +static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + return NULL; +} #endif #ifdef CONFIG_ACPI_LPIT -- cgit v1.2.3 From 5404f5c06dd41fd4445a01dec77a629e254a62e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:20 +0100 Subject: of/irq: Add interrupt affinity reporting interface Plug the irq_populate_fwspec_info() helper into the OF layer to offer an interrupt affinity reporting function. 
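For illustration (an editorial sketch, not part of the patch): a consumer holding a device_node 'np' could query the reported affinity of its first interrupt like this.

    /* Sketch: NULL is returned when parsing fails or nothing is reported. */
    const struct cpumask *mask = of_irq_get_affinity(np, 0);

    if (mask)
        pr_info("%pOF: irq 0 affinity %*pbl\n", np, cpumask_pr_args(mask));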
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-4-maz@kernel.org --- drivers/of/irq.c | 20 ++++++++++++++++++++ include/linux/of_irq.h | 7 +++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 65c3c23255b7..168fde921bd2 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -479,6 +479,26 @@ out: } EXPORT_SYMBOL_GPL(of_irq_get); +const struct cpumask *of_irq_get_affinity(struct device_node *dev, int index) +{ + struct of_phandle_args oirq; + struct irq_fwspec_info info; + struct irq_fwspec fwspec; + int rc; + + rc = of_irq_parse_one(dev, index, &oirq); + if (rc) + return NULL; + + of_phandle_args_to_fwspec(oirq.np, oirq.args, oirq.args_count, + &fwspec); + + if (irq_populate_fwspec_info(&fwspec, &info)) + return NULL; + + return info.affinity; +} + /** * of_irq_get_byname - Decode a node's IRQ and return it as a Linux IRQ number * @dev: pointer to device tree node diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1db8543dfc8a..1c2bc0281807 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); extern int of_irq_get(struct device_node *dev, int index); +extern const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); @@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name) { return 0; } +static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index) +{ + return NULL; +} static inline int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs) { -- cgit v1.2.3 From 0d5daa938c94b8b9183e9b257a88dc0929d59409 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:21 +0100 Subject: platform: Add firmware-agnostic irq and affinity retrieval interface Expand platform_get_irq_optional() to also return an affinity if available, renaming it to platform_get_irq_affinity() in the process. platform_get_irq_optional() is preserved with its current semantics by calling into the new helper with a NULL affinity pointer. 
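For illustration (an editorial sketch, not from the patch): a hypothetical platform driver probe could fetch the interrupt and its affinity in one go.

    const struct cpumask *affinity;
    int irq;

    /* Sketch: with this patch, 'affinity' falls back to cpu_possible_mask
     * when the firmware does not report anything more specific. */
    irq = platform_get_irq_affinity(pdev, 0, &affinity);
    if (irq < 0)
        return irq;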
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-5-maz@kernel.org --- drivers/base/platform.c | 71 ++++++++++++++++++++++++++++++++--------- include/linux/platform_device.h | 2 ++ 2 files changed, 58 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 09450349cf32..b45d41b018ca 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -150,25 +150,37 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); #endif /* CONFIG_HAS_IOMEM */ +static const struct cpumask *get_irq_affinity(struct platform_device *dev, + unsigned int num) +{ + const struct cpumask *mask = NULL; +#ifndef CONFIG_SPARC + struct fwnode_handle *fwnode = dev_fwnode(&dev->dev); + + if (is_of_node(fwnode)) + mask = of_irq_get_affinity(to_of_node(fwnode), num); + else if (is_acpi_device_node(fwnode)) + mask = acpi_irq_get_affinity(ACPI_HANDLE_FWNODE(fwnode), num); +#endif + + return mask ?: cpu_possible_mask; +} + /** - * platform_get_irq_optional - get an optional IRQ for a device - * @dev: platform device - * @num: IRQ number index + * platform_get_irq_affinity - get an optional IRQ and its affinity for a device + * @dev: platform device + * @num: interrupt number index + * @affinity: optional cpumask pointer to get the affinity of a per-cpu interrupt * - * Gets an IRQ for a platform device. Device drivers should check the return - * value for errors so as to not pass a negative integer value to the - * request_irq() APIs. This is the same as platform_get_irq(), except that it - * does not print an error message if an IRQ can not be obtained. - * - * For example:: - * - * int irq = platform_get_irq_optional(pdev, 0); - * if (irq < 0) - * return irq; + * Gets an interupt for a platform device. Device drivers should check the + * return value for errors so as to not pass a negative integer value to + * the request_irq() APIs. Optional affinity information is provided in the + * affinity pointer if available, and NULL otherwise. * - * Return: non-zero IRQ number on success, negative error number on failure. + * Return: non-zero interrupt number on success, negative error number on failure. */ -int platform_get_irq_optional(struct platform_device *dev, unsigned int num) +int platform_get_irq_affinity(struct platform_device *dev, unsigned int num, + const struct cpumask **affinity) { int ret; #ifdef CONFIG_SPARC @@ -236,8 +248,37 @@ out_not_found: out: if (WARN(!ret, "0 is an invalid IRQ number\n")) return -EINVAL; + + if (ret > 0 && affinity) + *affinity = get_irq_affinity(dev, num); + return ret; } +EXPORT_SYMBOL_GPL(platform_get_irq_affinity); + +/** + * platform_get_irq_optional - get an optional interrupt for a device + * @dev: platform device + * @num: interrupt number index + * + * Gets an interrupt for a platform device. Device drivers should check the + * return value for errors so as to not pass a negative integer value to + * the request_irq() APIs. This is the same as platform_get_irq(), except + * that it does not print an error message if an interrupt can not be + * obtained. + * + * For example:: + * + * int irq = platform_get_irq_optional(pdev, 0); + * if (irq < 0) + * return irq; + * + * Return: non-zero interrupt number on success, negative error number on failure. 
+ */ +int platform_get_irq_optional(struct platform_device *dev, unsigned int num) +{ + return platform_get_irq_affinity(dev, num, NULL); +} EXPORT_SYMBOL_GPL(platform_get_irq_optional); /** diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..ad66333ce85c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, extern int platform_get_irq(struct platform_device *, unsigned int); extern int platform_get_irq_optional(struct platform_device *, unsigned int); +extern int platform_get_irq_affinity(struct platform_device *, unsigned int, + const struct cpumask **); extern int platform_irq_count(struct platform_device *); extern int devm_platform_get_irqs_affinity(struct platform_device *dev, struct irq_affinity *affd, -- cgit v1.2.3 From 5ff78c8de9d83ad6fc0553bf8f2edc816385837d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:28 +0100 Subject: genirq: Kill handle_percpu_devid_fasteoi_nmi() There is no in-tree user of this flow handler anymore, so simply remove it. Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-12-maz@kernel.org --- include/linux/irq.h | 1 - kernel/irq/chip.c | 25 ------------------------- 2 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c67e76fbcc07..b728c18f6ded 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ffa0d80ddd1..633e1f67bb6f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -929,31 +929,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * Similar to handle_fasteoi_nmi, but handling the dev_id cookie - * as a percpu pointer. - */ -void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) -- cgit v1.2.3 From 5c2b2cc472e015e79c4f0170893a1e0883bd3bb4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:29 +0100 Subject: genirq: Merge irqaction::{dev_id,percpu_dev_id} When irqaction::percpu_dev_id was introduced, it was hoped that it could be part of an anonymous union with dev_id, as the two fields are mutually exclusive. 
However, toolchains used at the time were often showing terrible support for anonymous unions, breaking the build on a number of architectures. It was therefore decided to keep the two fields separate and address this down the line. 14 years later, the compiler dark age is over, and there is universal support for anonymous unions. Get a whole pointer back that can immediately be spent on something else. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-13-maz@kernel.org --- include/linux/interrupt.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c0493..0ec1a71ab4e8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -121,8 +121,10 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); */ struct irqaction { irq_handler_t handler; - void *dev_id; - void __percpu *percpu_dev_id; + union { + void *dev_id; + void __percpu *percpu_dev_id; + }; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; -- cgit v1.2.3 From 258e7d28a3dcd389239f9688058140c1a418b549 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:31 +0100 Subject: genirq: Add affinity to percpu_devid interrupt requests Add an affinity field to both the irqaction structure and the interrupt request primitives. Nothing is making use of it yet, and the only value used is NULL, which is used as a shorthand for cpu_possible_mask. This will shortly get used with actual affinities. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-15-maz@kernel.org --- include/linux/interrupt.h | 5 +++-- kernel/irq/manage.c | 14 ++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0ec1a71ab4e8..52147d5f432b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -125,6 +125,7 @@ struct irqaction { void *dev_id; void __percpu *percpu_dev_id; }; + const struct cpumask *affinity; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; @@ -181,7 +182,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *percpu_dev_id); + const cpumask_t *affinity, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, @@ -192,7 +193,7 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, - devname, percpu_dev_id); + devname, NULL, percpu_dev_id); } extern int __must_check diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d9ddc30678b5..5f4c65167743 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2444,10 +2444,14 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) static struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, - const char *devname, void __percpu *dev_id) + const char *devname, const cpumask_t *affinity, + void __percpu *dev_id) { struct irqaction *action; + if (!affinity) + affinity = cpu_possible_mask; + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action) return NULL; @@ -2456,6 +2460,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; + action->affinity = affinity; return action; } @@ -2466,6 +2471,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources, but doesn't enable the interrupt @@ -2478,7 +2484,7 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *dev_id) + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2495,7 +2501,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, if (flags && flags != IRQF_TIMER) return -EINVAL; - action = create_percpu_irqaction(handler, flags, devname, dev_id); + action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); if (!action) return -ENOMEM; @@ -2560,7 +2566,7 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, return -EINVAL; action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, - name, dev_id); + name, NULL, dev_id); if (!action) return -ENOMEM; -- cgit v1.2.3 From b9c6aa9efc71dae656f9f913d1250ea08cd6e10f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:32 +0100 Subject: genirq: Update request_percpu_nmi() to take an affinity Continue spreading the notion of affinity to the per CPU interrupt request code by updating the call sites that use request_percpu_nmi() (all two of them) to take an affinity pointer. This pointer is firmly NULL for now. 
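For illustration (an editorial sketch, not part of the patch): with the new prototype, a caller that keeps the old behaviour passes a NULL affinity, which the core treats as cpu_possible_mask; 'my_handler' and 'my_pcpu_state' are placeholder names.

    /* Sketch: NULL affinity preserves the previous "all CPUs" semantics. */
    err = request_percpu_nmi(irq, my_handler, "my-dev", NULL, &my_pcpu_state);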
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-16-maz@kernel.org --- arch/arm64/kernel/smp.c | 2 +- drivers/perf/arm_pmu.c | 2 +- include/linux/interrupt.h | 4 ++-- kernel/irq/manage.c | 12 +++++++----- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68cea3a4a35c..6fb838eee2e7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1094,7 +1094,7 @@ static void ipi_setup_sgi(int ipi) irq = ipi_irq_base + ipi; if (ipi_should_be_nmi(ipi)) { - err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat); + err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat); WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err); } else { err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat); diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 5c310e803dd7..22c601b46c85 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -659,7 +659,7 @@ int armpmu_request_irq(int irq, int cpu) irq_ops = &pmunmi_ops; } } else if (armpmu_count_irq_users(irq) == 0) { - err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); + err = request_percpu_nmi(irq, handler, "arm-pmu", NULL, &cpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 52147d5f432b..81506ab759b8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -197,8 +197,8 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, } extern int __must_check -request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev); +request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f4c65167743..b1a3140e5f3c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2527,6 +2527,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs @@ -2543,8 +2544,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. 
*/ -int request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *name, void __percpu *dev_id) +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2561,12 +2562,13 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, !irq_supports_nmi(desc)) return -EINVAL; - /* The line cannot already be NMI */ - if (irq_is_nmi(desc)) + /* The line cannot be NMI already if the new request covers all CPUs */ + if (irq_is_nmi(desc) && + (!affinity || cpumask_equal(affinity, cpu_possible_mask))) return -EINVAL; action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, - name, NULL, dev_id); + name, affinity, dev_id); if (!action) return -ENOMEM; -- cgit v1.2.3 From c734af3b2b95f0ac6ed87c50e7602a6beeaf534f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:34 +0100 Subject: genirq: Add request_percpu_irq_affinity() helper While it would be nice to simply make request_percpu_irq() take an affinity mask, the churn is likely to be on the irritating side given that most drivers do not give a damn about affinities. So take the more innocuous path to provide a helper that parallels request_percpu_irq(), with an affinity as a bonus argument. Yes, request_percpu_irq_affinity() is a bit of a mouthful. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-18-maz@kernel.org --- include/linux/interrupt.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 81506ab759b8..fa62ab556ee3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -196,6 +196,15 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, devname, NULL, percpu_dev_id); } +static inline int __must_check +request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, + const char *devname, const cpumask_t *affinity, + void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, affinity, percpu_dev_id); +} + extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, const struct cpumask *affinity, void __percpu *dev_id); -- cgit v1.2.3 From 54b350fa8e965dc59622698e2a18d6bf73944bf4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 20 Oct 2025 13:29:35 +0100 Subject: perf: arm_pmu: Request specific affinities for per CPU NMIs/interrupts Let the PMU driver request both NMIs and normal interrupts with an affinity mask matching the PMU affinity. 
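For illustration (an editorial sketch distilled from the diff below): the request path tries an NMI with the PMU's cpumask first and falls back to a normal per-CPU interrupt with the same affinity; 'affinity' here stands for &armpmu->supported_cpus.

    err = request_percpu_nmi(irq, handler, "arm-pmu", affinity, pcpu_armpmu);
    if (err)
        err = request_percpu_irq_affinity(irq, handler, "arm-pmu",
                                          affinity, pcpu_armpmu);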
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-19-maz@kernel.org --- drivers/perf/arm_pmu.c | 44 ++++++++++++++++++++++++----------------- drivers/perf/arm_pmu_acpi.c | 2 +- drivers/perf/arm_pmu_platform.c | 4 ++-- include/linux/perf/arm_pmu.h | 4 ++-- 4 files changed, 31 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 22c601b46c85..959ceb3d1f55 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -26,7 +26,8 @@ #include -static int armpmu_count_irq_users(const int irq); +static int armpmu_count_irq_users(const struct cpumask *affinity, + const int irq); struct pmu_irq_ops { void (*enable_pmuirq)(unsigned int irq); @@ -64,7 +65,9 @@ static void armpmu_enable_percpu_pmuirq(unsigned int irq) static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { - if (armpmu_count_irq_users(irq) == 1) + struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu); + + if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1) free_percpu_irq(irq, devid); } @@ -89,7 +92,9 @@ static void armpmu_disable_percpu_pmunmi(unsigned int irq) static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { - if (armpmu_count_irq_users(irq) == 1) + struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu); + + if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1) free_percpu_nmi(irq, devid); } @@ -580,11 +585,11 @@ static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; -static int armpmu_count_irq_users(const int irq) +static int armpmu_count_irq_users(const struct cpumask *affinity, const int irq) { int cpu, count = 0; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, affinity) { if (per_cpu(cpu_irq, cpu) == irq) count++; } @@ -592,12 +597,13 @@ static int armpmu_count_irq_users(const int irq) return count; } -static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) +static const struct pmu_irq_ops * +armpmu_find_irq_ops(const struct cpumask *affinity, int irq) { const struct pmu_irq_ops *ops = NULL; int cpu; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, affinity) { if (per_cpu(cpu_irq, cpu) != irq) continue; @@ -609,22 +615,25 @@ static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) return ops; } -void armpmu_free_irq(int irq, int cpu) +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) return; if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; - per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); + per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, armpmu); per_cpu(cpu_irq, cpu) = 0; per_cpu(cpu_irq_ops, cpu) = NULL; } -int armpmu_request_irq(int irq, int cpu) +int armpmu_request_irq(struct arm_pmu * __percpu *pcpu_armpmu, int irq, int cpu) { int err = 0; + struct arm_pmu **armpmu = per_cpu_ptr(pcpu_armpmu, cpu); + const struct cpumask *affinity = *armpmu ? 
&(*armpmu)->supported_cpus : + cpu_possible_mask; /* ACPI */ const irq_handler_t handler = armpmu_dispatch_irq; const struct pmu_irq_ops *irq_ops; @@ -646,25 +655,24 @@ int armpmu_request_irq(int irq, int cpu) IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; - err = request_nmi(irq, handler, irq_flags, "arm-pmu", - per_cpu_ptr(&cpu_armpmu, cpu)); + err = request_nmi(irq, handler, irq_flags, "arm-pmu", armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_irq(irq, handler, irq_flags, "arm-pmu", - per_cpu_ptr(&cpu_armpmu, cpu)); + armpmu); irq_ops = &pmuirq_ops; } else { has_nmi = true; irq_ops = &pmunmi_ops; } - } else if (armpmu_count_irq_users(irq) == 0) { - err = request_percpu_nmi(irq, handler, "arm-pmu", NULL, &cpu_armpmu); + } else if (armpmu_count_irq_users(affinity, irq) == 0) { + err = request_percpu_nmi(irq, handler, "arm-pmu", affinity, pcpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { - err = request_percpu_irq(irq, handler, "arm-pmu", - &cpu_armpmu); + err = request_percpu_irq_affinity(irq, handler, "arm-pmu", + affinity, pcpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { has_nmi = true; @@ -672,7 +680,7 @@ int armpmu_request_irq(int irq, int cpu) } } else { /* Per cpudevid irq was already requested by another CPU */ - irq_ops = armpmu_find_irq_ops(irq); + irq_ops = armpmu_find_irq_ops(affinity, irq); if (WARN_ON(!irq_ops)) err = -EINVAL; diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 05dda19c5359..e80f76d95e68 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -218,7 +218,7 @@ static int arm_pmu_acpi_parse_irqs(void) * them with their PMUs. */ per_cpu(pmu_irqs, cpu) = irq; - err = armpmu_request_irq(irq, cpu); + err = armpmu_request_irq(&probed_pmus, irq, cpu); if (err) goto out_err; } diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c index 9c0494d8a867..1c9e50a13201 100644 --- a/drivers/perf/arm_pmu_platform.c +++ b/drivers/perf/arm_pmu_platform.c @@ -165,7 +165,7 @@ static int armpmu_request_irqs(struct arm_pmu *armpmu) if (!irq) continue; - err = armpmu_request_irq(irq, cpu); + err = armpmu_request_irq(&hw_events->percpu_pmu, irq, cpu); if (err) break; } @@ -181,7 +181,7 @@ static void armpmu_free_irqs(struct arm_pmu *armpmu) for_each_cpu(cpu, &armpmu->supported_cpus) { int irq = per_cpu(hw_events->irq, cpu); - armpmu_free_irq(irq, cpu); + armpmu_free_irq(&hw_events->percpu_pmu, irq, cpu); } } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..6690bd77aa4e 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -190,8 +190,8 @@ bool arm_pmu_irq_is_nmi(void); struct arm_pmu *armpmu_alloc(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); -int armpmu_request_irq(int irq, int cpu); -void armpmu_free_irq(int irq, int cpu); +int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" -- cgit v1.2.3 From c620438ef2ac80b09269a9ae3c0b4fe5add19bfe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:40 +0100 Subject: irqchip: Kill irq-partition-percpu This code is now completely unused, and nobody will ever miss it. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-24-maz@kernel.org --- drivers/irqchip/Kconfig | 3 - drivers/irqchip/Makefile | 1 - drivers/irqchip/irq-partition-percpu.c | 241 --------------------------- include/linux/irqchip/irq-partition-percpu.h | 53 ------ 4 files changed, 298 deletions(-) delete mode 100644 drivers/irqchip/irq-partition-percpu.c delete mode 100644 include/linux/irqchip/irq-partition-percpu.h (limited to 'include/linux') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 648b3618fc78..5dddb4c9442a 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -450,9 +450,6 @@ config LS_SCFG_MSI depends on PCI_MSI select IRQ_MSI_LIB -config PARTITION_PERCPU - bool - config STM32MP_EXTI tristate "STM32MP extended interrupts and event controller" depends on (ARCH_STM32 && !ARM_SINGLE_ARMV7M) || COMPILE_TEST diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 3de083f5484c..6a229443efe0 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_ARM_GIC_V3) += irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o obj-$(CONFIG_ARM_GIC_ITS_PARENT) += irq-gic-its-msi-parent.o obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v4.o obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC) += irq-gic-v3-its-fsl-mc-msi.o -obj-$(CONFIG_PARTITION_PERCPU) += irq-partition-percpu.o obj-$(CONFIG_ARM_GIC_V5) += irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o \ irq-gic-v5-iwb.o obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c deleted file mode 100644 index 4441ffe149ea..000000000000 --- a/drivers/irqchip/irq-partition-percpu.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. 
- * Author: Marc Zyngier - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct partition_desc { - int nr_parts; - struct partition_affinity *parts; - struct irq_domain *domain; - struct irq_desc *chained_desc; - unsigned long *bitmap; - struct irq_domain_ops ops; -}; - -static bool partition_check_cpu(struct partition_desc *part, - unsigned int cpu, unsigned int hwirq) -{ - return cpumask_test_cpu(cpu, &part->parts[hwirq].mask); -} - -static void partition_irq_mask(struct irq_data *d) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_mask) - chip->irq_mask(data); -} - -static void partition_irq_unmask(struct irq_data *d) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_unmask) - chip->irq_unmask(data); -} - -static int partition_irq_set_irqchip_state(struct irq_data *d, - enum irqchip_irq_state which, - bool val) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_set_irqchip_state) - return chip->irq_set_irqchip_state(data, which, val); - - return -EINVAL; -} - -static int partition_irq_get_irqchip_state(struct irq_data *d, - enum irqchip_irq_state which, - bool *val) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_get_irqchip_state) - return chip->irq_get_irqchip_state(data, which, val); - - return -EINVAL; -} - -static int partition_irq_set_type(struct irq_data *d, unsigned int type) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (chip->irq_set_type) - return chip->irq_set_type(data, type); - - return -EINVAL; -} - -static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - seq_printf(p, "%5s-%lu", chip->name, data->hwirq); -} - -static struct irq_chip partition_irq_chip = { - .irq_mask = partition_irq_mask, - .irq_unmask = partition_irq_unmask, - .irq_set_type = partition_irq_set_type, - .irq_get_irqchip_state = partition_irq_get_irqchip_state, - .irq_set_irqchip_state = partition_irq_set_irqchip_state, - .irq_print_chip = partition_irq_print_chip, -}; - -static void partition_handle_irq(struct irq_desc *desc) -{ - struct partition_desc *part = irq_desc_get_handler_data(desc); - struct irq_chip *chip = irq_desc_get_chip(desc); - int cpu = smp_processor_id(); - int hwirq; - - chained_irq_enter(chip, desc); - - for_each_set_bit(hwirq, part->bitmap, part->nr_parts) { - if 
(partition_check_cpu(part, cpu, hwirq)) - break; - } - - if (unlikely(hwirq == part->nr_parts)) - handle_bad_irq(desc); - else - generic_handle_domain_irq(part->domain, hwirq); - - chained_irq_exit(chip, desc); -} - -static int partition_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - int ret; - irq_hw_number_t hwirq; - unsigned int type; - struct irq_fwspec *fwspec = arg; - struct partition_desc *part; - - BUG_ON(nr_irqs != 1); - ret = domain->ops->translate(domain, fwspec, &hwirq, &type); - if (ret) - return ret; - - part = domain->host_data; - - set_bit(hwirq, part->bitmap); - irq_set_chained_handler_and_data(irq_desc_get_irq(part->chained_desc), - partition_handle_irq, part); - irq_set_percpu_devid_partition(virq, &part->parts[hwirq].mask); - irq_domain_set_info(domain, virq, hwirq, &partition_irq_chip, part, - handle_percpu_devid_irq, NULL, NULL); - irq_set_status_flags(virq, IRQ_NOAUTOEN); - - return 0; -} - -static void partition_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) -{ - struct irq_data *d; - - BUG_ON(nr_irqs != 1); - - d = irq_domain_get_irq_data(domain, virq); - irq_set_handler(virq, NULL); - irq_domain_reset_irq_data(d); -} - -int partition_translate_id(struct partition_desc *desc, void *partition_id) -{ - struct partition_affinity *part = NULL; - int i; - - for (i = 0; i < desc->nr_parts; i++) { - if (desc->parts[i].partition_id == partition_id) { - part = &desc->parts[i]; - break; - } - } - - if (WARN_ON(!part)) { - pr_err("Failed to find partition\n"); - return -EINVAL; - } - - return i; -} - -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - struct partition_desc *desc; - struct irq_domain *d; - - BUG_ON(!ops->select || !ops->translate); - - desc = kzalloc(sizeof(*desc), GFP_KERNEL); - if (!desc) - return NULL; - - desc->ops = *ops; - desc->ops.free = partition_domain_free; - desc->ops.alloc = partition_domain_alloc; - - d = irq_domain_create_linear(fwnode, nr_parts, &desc->ops, desc); - if (!d) - goto out; - desc->domain = d; - - desc->bitmap = bitmap_zalloc(nr_parts, GFP_KERNEL); - if (WARN_ON(!desc->bitmap)) - goto out; - - desc->chained_desc = irq_to_desc(chained_irq); - desc->nr_parts = nr_parts; - desc->parts = parts; - - return desc; -out: - if (d) - irq_domain_remove(d); - kfree(desc); - - return NULL; -} - -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - if (dsc) - return dsc->domain; - - return NULL; -} diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h deleted file mode 100644 index b35ee22c278f..000000000000 --- a/include/linux/irqchip/irq-partition-percpu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. 
- * Author: Marc Zyngier - */ - -#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H -#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H - -#include -#include -#include - -struct partition_affinity { - cpumask_t mask; - void *partition_id; -}; - -struct partition_desc; - -#ifdef CONFIG_PARTITION_PERCPU -int partition_translate_id(struct partition_desc *desc, void *partition_id); -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops); -struct irq_domain *partition_get_domain(struct partition_desc *dsc); -#else -static inline int partition_translate_id(struct partition_desc *desc, - void *partition_id) -{ - return -EINVAL; -} - -static inline -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - return NULL; -} - -static inline -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - return NULL; -} -#endif - -#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */ -- cgit v1.2.3 From ee2d50a9f524ae829d1a8ec296d7a0170e7b8ade Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:41 +0100 Subject: genirq: Kill irq_{g,s}et_percpu_devid_partition() These two helpers do not have any user anymore, and can be removed, together with the affinity field kept in the irqdesc structure. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-25-maz@kernel.org --- include/linux/irq.h | 4 ---- include/linux/irqdesc.h | 1 - kernel/irq/irqdesc.c | 24 +----------------------- 3 files changed, 1 insertion(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b728c18f6ded..4a9f1d7b08c3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -718,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq, } extern int irq_set_percpu_devid(unsigned int irq); -extern int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity); -extern int irq_get_percpu_devid_partition(unsigned int irq, - struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index fd091c35d572..37e0b5b5600a 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -82,7 +82,6 @@ struct irq_desc { int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; - const struct cpumask *percpu_affinity; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index db714d3014b5..6acf268f005b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity) +int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - desc->percpu_affinity = affinity ? 
: cpu_possible_mask; - irq_set_percpu_devid_flags(irq); return 0; } -int irq_set_percpu_devid(unsigned int irq) -{ - return irq_set_percpu_devid_partition(irq, NULL); -} - -int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !desc->percpu_enabled) - return -EINVAL; - - if (affinity) - cpumask_copy(affinity, desc->percpu_affinity); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); -- cgit v1.2.3 From ebac4649fcadc6047030810326875c6e612c7b2f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:42 +0100 Subject: irqdomain: Kill of_node_to_fwnode() helper There is no in-tree users of this helper since b13b41cc3dc18 ("misc: ti_fpc202: Switch to of_fwnode_handle()"), and is replaced with of_fwnode_handle(). Get rid of it. Suggested-by: Jonathan Cameron Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-26-maz@kernel.org --- include/linux/irqdomain.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9d6a5e99394f..5907baf6099d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -730,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif -/* Deprecated functions. Will be removed in the merge window */ -static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) -{ - return node ? &node->fwnode : NULL; -} - static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) -- cgit v1.2.3 From fa9d2777387346645a40ab37cfb0c37b3ef40cc9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:43 +0100 Subject: perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer Having removed the use of the cpu_armpmu per-CPU variable from the interrupt handling, the only user left is the BRBE scheduler hook. It is easy to drop the use of this variable by following the pointer to the generic PMU structure, and get the arm_pmu structure from there. Perform the conversion and kill cpu_armpmu altogether. 
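The lookup this relies on is simple: struct arm_pmu embeds the generic struct pmu, so any path that already carries a struct pmu pointer can recover the wrapper with container_of(), which is exactly what the in-tree to_arm_pmu() macro does. A minimal sketch of that pointer chase as used by the sched_task hook (the standalone function name is illustrative only, not part of the patch):

#include <linux/perf_event.h>
#include <linux/perf/arm_pmu.h>

/* Recover the arm_pmu wrapper from the generic pmu carried by the context. */
static struct arm_pmu *sketch_ctx_to_arm_pmu(struct perf_event_pmu_context *pmu_ctx)
{
	/* struct arm_pmu has a 'struct pmu pmu' member, so container_of() applies */
	return container_of(pmu_ctx->pmu, struct arm_pmu, pmu);
}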
Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-27-maz@kernel.org --- drivers/perf/arm_pmu.c | 5 ----- drivers/perf/arm_pmuv3.c | 2 +- include/linux/perf/arm_pmu.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 959ceb3d1f55..f7abd1333963 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -104,7 +104,6 @@ static const struct pmu_irq_ops percpu_pmunmi_ops = { .free_pmuirq = armpmu_free_percpu_pmunmi }; -DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); @@ -725,8 +724,6 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) if (pmu->reset) pmu->reset(pmu); - per_cpu(cpu_armpmu, cpu) = pmu; - irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); @@ -746,8 +743,6 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) if (irq) per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); - per_cpu(cpu_armpmu, cpu) = NULL; - return 0; } diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 69c5cc8f5606..ca8d706d4022 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1064,7 +1064,7 @@ static int armv8pmu_user_event_idx(struct perf_event *event) static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in) { - struct arm_pmu *armpmu = *this_cpu_ptr(&cpu_armpmu); + struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); if (!hw_events->branch_users) diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 6690bd77aa4e..bab26a7d79f4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -132,8 +132,6 @@ struct arm_pmu { #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) -DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu); - u64 armpmu_event_update(struct perf_event *event); int armpmu_event_set_period(struct perf_event *event); -- cgit v1.2.3 From 531b87d865eb9e625c2e46ec8f06a65a6157ee45 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:45 +0000 Subject: bpf: widen dynptr size/offset to 64 bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynptr currently caps size and offset at 24 bits, which isn’t sufficient for file-backed use cases; even 32 bits can be limiting. Refactor dynptr helpers/kfuncs to use 64-bit size and offset, ensuring consistency across the APIs. This change does not affect internals of xdp, skb or other dynptrs, which continue to behave as before. Also it does not break binary compatibility. The widening enables large-file access support via dynptr, implemented in the next patches. 
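From the program side the helper names and call sites are unchanged; only the length/offset arguments are now carried as 64-bit values. A minimal sketch of a caller under the widened prototypes (program type, section and buffer names are assumptions for illustration; local-memory dynptrs remain bounded by DYNPTR_MAX_SIZE, the wider types matter mostly for the file-backed dynptrs added later in the series):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char src_buf[256];
char dst_buf[64];

SEC("tp/syscalls/sys_enter_nanosleep")
int dynptr_u64_args(void *ctx)
{
	struct bpf_dynptr ptr;
	__u64 off = 128;		/* offset argument is u64 after this change */
	__u64 len = sizeof(dst_buf);	/* length argument is u64 after this change */

	/* bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) */
	if (bpf_dynptr_from_mem(src_buf, sizeof(src_buf), 0, &ptr))
		return 0;

	/* bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) */
	bpf_dynptr_read(dst_buf, len, &ptr, off, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";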
Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 +++---- include/uapi/linux/bpf.h | 8 +-- kernel/bpf/helpers.c | 66 +++++++++++----------- kernel/trace/bpf_trace.c | 46 +++++++-------- tools/include/uapi/linux/bpf.h | 8 +-- tools/testing/selftests/bpf/bpf_kfuncs.h | 12 ++-- tools/testing/selftests/bpf/progs/dynptr_success.c | 12 ++-- 7 files changed, 86 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e53cda0aabb6..907c69295293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1387,19 +1387,19 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB_META, }; -int bpf_dynptr_check_size(u32 size); -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +int bpf_dynptr_check_size(u64 size); +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, - void *src, u32 len, u64 flags); -void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, + void *src, u64 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk); -static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { - u32 size = __bpf_dynptr_size(ptr); + u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f5..77edd0253989 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5618,7 +5618,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5661,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5671,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5692,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. 
For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9ec6ee21c94..a2ce17ea5edb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1684,19 +1684,19 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } @@ -1715,7 +1715,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1750,8 +1750,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1787,8 +1787,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1804,8 +1804,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1848,8 +1848,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1865,7 +1865,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -2680,12 +2680,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents 
containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2767,8 +2767,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2800,10 +2800,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2836,7 +2836,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2873,14 +2873,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2902,7 +2902,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2928,10 +2928,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. 
*/ - __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) - { +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 chunk_sz, write_off; + u64 chunk_sz, write_off; char buf[256]; void* slice; int err; @@ -2950,11 +2950,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return err; /* Non-linear data under the dynptr, write from a local buffer */ - chunk_sz = min_t(u32, sizeof(buf), size); + chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - write_off); + chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; @@ -4469,7 +4469,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -4480,7 +4480,7 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4f87c16d915a..a795f7afbf3d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; - u32 chunk_sz, off; + u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; @@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. 
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return off; } -static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, - u32 size, const void *unsafe_src, +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; - u32 off, chunk_sz; + u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); @@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 return -E2BIG; for (off = 0; off < size; off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; @@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, 
size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f5..77edd0253989 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5618,7 +5618,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5661,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5671,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5692,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. 
* diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 794d44d19c88..e0189254bb6e 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, + void *buffer, __u64 buffer__szk) __ksym __weak; /* Description * Obtain a read-write pointer to the dynptr's data @@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, + __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak; +extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; -extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; +extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak; /* Description diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 127dea342e5a..e0d672d93adf 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,8 +914,8 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr); +typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. * Any write larger than 32 bytes will be split between 2 fragments. 
@@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); -- cgit v1.2.3 From 76e4fed847124690f7344a43d01dbcd7b2925353 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:46 +0000 Subject: lib: move freader into buildid.h Move struct freader and prototypes of the functions operating on it into the buildid.h. This allows reusing freader outside buildid, e.g. for file dynptr support added later. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251026203853.135105-4-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + include/linux/buildid.h | 25 +++++++++++++++++++++++++ lib/buildid.c | 29 +++++------------------------ 3 files changed, 31 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 545a4776795e..7564692f2f3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4648,6 +4648,7 @@ F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* F: include/linux/btf* +F: include/linux/buildid.h F: include/linux/filter.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 014a88c41073..831c1b4b626c 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -18,4 +18,29 @@ void init_vmlinux_build_id(void); static inline void init_vmlinux_build_id(void) { } #endif +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct file *file; + struct folio *folio; + void *addr; + loff_t folio_off; + bool may_fault; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault); +void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz); +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz); +void freader_cleanup(struct freader *r); + #endif diff --git a/lib/buildid.c b/lib/buildid.c index c4b0f376fb34..df06e492810d 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -11,27 +11,8 @@ #define MAX_PHDR_CNT 256 -struct freader { - void *buf; - u32 buf_sz; - int err; - union { - struct { - struct file *file; - struct folio *folio; - void *addr; - loff_t folio_off; - bool may_fault; - }; - struct { - const char *data; - u64 data_sz; - }; - }; -}; - -static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, - struct file *file, bool may_fault) +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault) { memset(r, 0, sizeof(*r)); r->buf = buf; @@ -40,7 +21,7 @@ static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, r->may_fault = may_fault; } -static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) +void freader_init_from_mem(struct freader *r, const char 
*data, u64 data_sz) { memset(r, 0, sizeof(*r)); r->data = data; @@ -92,7 +73,7 @@ static int freader_get_folio(struct freader *r, loff_t file_off) return 0; } -static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) { size_t folio_sz; @@ -147,7 +128,7 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) return r->addr + (file_off - r->folio_off); } -static void freader_cleanup(struct freader *r) +void freader_cleanup(struct freader *r) { if (!r->buf) return; /* non-file-backed mode */ -- cgit v1.2.3 From 8d8771dc03e48300e80b43744dd3c320ccaf746a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:49 +0000 Subject: bpf: add plumbing for file-backed dynptr Add the necessary verifier plumbing for the new file-backed dynptr type. Introduce two kfuncs for its lifecycle management: * bpf_dynptr_from_file() for initialization * bpf_dynptr_file_discard() for destruction Currently there is no mechanism for kfunc to release dynptr, this patch add one: * Dynptr release function sets meta->release_regno * Call unmark_stack_slots_dynptr() if meta->release_regno is set and dynptr ref_obj_id is set as well. Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-7-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 12 ++++++++++++ kernel/bpf/log.c | 2 ++ kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++------ 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 907c69295293..14f800773997 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -792,12 +792,15 @@ enum bpf_type_flag { /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to file */ + DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. 
*/ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1385,6 +1388,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, + /* Underlying data is a file */ + BPF_DYNPTR_TYPE_FILE, }; int bpf_dynptr_check_size(u64 size); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a2ce17ea5edb..bf65b7fb761f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4252,6 +4252,16 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } +__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return 0; +} + +__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) +{ + return 0; +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4429,6 +4439,8 @@ BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index f50533169cc3..70221aafc35c 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -500,6 +500,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "xdp"; case BPF_DYNPTR_TYPE_SKB_META: return "skb_meta"; + case BPF_DYNPTR_TYPE_FILE: + return "file"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f60cfab95230..cd48ead852a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -692,6 +692,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_XDP; case DYNPTR_TYPE_SKB_META: return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -710,6 +712,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_XDP; case BPF_DYNPTR_TYPE_SKB_META: return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -717,7 +721,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, @@ -12291,6 +12295,8 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, KF___bpf_trap, KF_bpf_task_work_schedule_signal, KF_bpf_task_work_schedule_resume, @@ -12363,6 +12369,8 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal) BTF_ID(func, bpf_task_work_schedule_resume) @@ -13326,6 +13334,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ 
dynptr_arg_type |= DYNPTR_TYPE_XDP; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; @@ -14006,12 +14019,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = ®s[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || -- cgit v1.2.3 From 2c52e8943a437af6093d8b0f0920f1764f0e5f64 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:52 +0000 Subject: bpf: dispatch to sleepable file dynptr File dynptr reads may sleep when the requested folios are not in the page cache. To avoid sleeping in non-sleepable contexts while still supporting valid sleepable use, given that dynptrs are non-sleepable by default, enable sleeping only when bpf_dynptr_from_file() is invoked from a sleepable context. 
This change: * Introduces a sleepable constructor: bpf_dynptr_from_file_sleepable() * Override non-sleepable constructor with sleepable if it's always called in sleepable context Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-10-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/helpers.c | 5 +++++ kernel/bpf/verifier.c | 10 +++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 14f800773997..a47d67db3be5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -670,6 +670,9 @@ static inline bool bpf_map_has_internal_structs(struct bpf_map *map) void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, + struct bpf_dynptr *ptr__uninit); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 99a7def0b978..930e132f440f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4336,6 +4336,11 @@ __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dy return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); } +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); +} + __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 61589be91c65..542e23fb19c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,7 +3124,8 @@ struct bpf_kfunc_btf_tab { u32 nr_descs; }; -static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc); +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, + int insn_idx); static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { @@ -21869,7 +21870,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) } /* replace a generic kfunc with a specialized version if necessary */ -static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; @@ -21904,6 +21905,9 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { if (bpf_lsm_has_d_inode_locked(prog)) addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; } set_imm: @@ -21963,7 +21967,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - err = specialize_kfunc(env, desc); + err = specialize_kfunc(env, desc, insn_idx); if (err) return err; -- cgit v1.2.3 From b94d45b6bbb42571ec225d3be0e7457c8765a5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Oct 2025 09:56:38 +0100 Subject: seqlock: Allow KASAN to fail optimizing Some KASAN builds are failing to properly optimize 
this code -- luckily we don't care about core quality for KASAN builds, so just exclude it. Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Closes: https://lore.kernel.org/oe-kbuild-all/202510251641.idrNXhv5-lkp@intel.com/ --- include/linux/seqlock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index b7bcc4111e90..a8a8661839b6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1234,11 +1234,14 @@ static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) extern void __scoped_seqlock_invalid_target(void); -#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000 +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) /* * For some reason some GCC-8 architectures (nios2, alpha) have trouble * determining that the ss_done state is impossible in __scoped_seqlock_next() * below. + * + * Similarly KASAN is known to confuse compilers enough to break this. But we + * don't care about code quality for KASAN builds anyway. */ static inline void __scoped_seqlock_bug(void) { } #else -- cgit v1.2.3 From 90a18c512884adb49ddc2fb30e94594169aae808 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:50 +0200 Subject: pinctrl: pinconf-generic: Handle string values for generic properties Allow a generic pinconf property to specify its argument as one of the strings in a match list. Convert the matching string to an integer value using the index in the list, then keep using this value in the generic pinconf code. Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 57 ++++++++++++++++++++++++--------- include/linux/pinctrl/pinconf-generic.h | 11 +++++-- 2 files changed, 50 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index e3d10bbcdaeb..72906d71ae1a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -65,11 +65,12 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, int i; for (i = 0; i < nitems; i++) { + const struct pin_config_item *item = &items[i]; unsigned long config; int ret; /* We want to check out this parameter */ - config = pinconf_to_config_packed(items[i].param, 0); + config = pinconf_to_config_packed(item->param, 0); if (gname) ret = pin_config_group_get(dev_name(pctldev->dev), gname, &config); @@ -86,15 +87,22 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, if (*print_sep) seq_puts(s, ", "); *print_sep = 1; - seq_puts(s, items[i].display); + seq_puts(s, item->display); /* Print unit if available */ - if (items[i].has_arg) { + if (item->has_arg) { u32 val = pinconf_to_config_argument(config); - if (items[i].format) - seq_printf(s, " (%u %s)", val, items[i].format); + if (item->format) + seq_printf(s, " (%u %s)", val, item->format); else seq_printf(s, " (0x%x)", val); + + if (item->values && item->num_values) { + if (val < item->num_values) + seq_printf(s, " \"%s\"", item->values[val]); + else + seq_puts(s, " \"(unknown)\""); + } } } } @@ -205,10 +213,10 @@ static const struct pinconf_generic_params dt_params[] = { * @ncfg. @ncfg is updated to reflect the number of entries after parsing. @cfg * needs to have enough memory allocated to hold all possible entries. 
*/ -static void parse_dt_cfg(struct device_node *np, - const struct pinconf_generic_params *params, - unsigned int count, unsigned long *cfg, - unsigned int *ncfg) +static int parse_dt_cfg(struct device_node *np, + const struct pinconf_generic_params *params, + unsigned int count, unsigned long *cfg, + unsigned int *ncfg) { int i; @@ -217,7 +225,19 @@ static void parse_dt_cfg(struct device_node *np, int ret; const struct pinconf_generic_params *par = ¶ms[i]; - ret = of_property_read_u32(np, par->property, &val); + if (par->values && par->num_values) { + ret = fwnode_property_match_property_string(of_fwnode_handle(np), + par->property, + par->values, par->num_values); + if (ret == -ENOENT) + return ret; + if (ret >= 0) { + val = ret; + ret = 0; + } + } else { + ret = of_property_read_u32(np, par->property, &val); + } /* property not found */ if (ret == -EINVAL) @@ -231,6 +251,8 @@ static void parse_dt_cfg(struct device_node *np, cfg[*ncfg] = pinconf_to_config_packed(par->param, val); (*ncfg)++; } + + return 0; } /** @@ -323,13 +345,16 @@ int pinconf_generic_parse_dt_config(struct device_node *np, if (!cfg) return -ENOMEM; - parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + ret = parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + if (ret) + return ret; if (pctldev && pctldev->desc->num_custom_params && - pctldev->desc->custom_params) - parse_dt_cfg(np, pctldev->desc->custom_params, - pctldev->desc->num_custom_params, cfg, &ncfg); - - ret = 0; + pctldev->desc->custom_params) { + ret = parse_dt_cfg(np, pctldev->desc->custom_params, + pctldev->desc->num_custom_params, cfg, &ncfg); + if (ret) + return ret; + } /* no configs found at all */ if (ncfg == 0) { diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d9245ecec71d..f82add5d3302 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -181,21 +181,28 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param return PIN_CONF_PACKED(param, argument); } -#define PCONFDUMP(a, b, c, d) { \ - .param = a, .display = b, .format = c, .has_arg = d \ +#define PCONFDUMP_WITH_VALUES(a, b, c, d, e, f) { \ + .param = a, .display = b, .format = c, .has_arg = d, \ + .values = e, .num_values = f \ } +#define PCONFDUMP(a, b, c, d) PCONFDUMP_WITH_VALUES(a, b, c, d, NULL, 0) + struct pin_config_item { const enum pin_config_param param; const char * const display; const char * const format; bool has_arg; + const char * const *values; + size_t num_values; }; struct pinconf_generic_params { const char * const property; enum pin_config_param param; u32 default_value; + const char * const *values; + size_t num_values; }; int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 55c7f5ef904fc2dcc7ef5945c5efb0cd60b46d32 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:51 +0200 Subject: pinctrl: pinconf-generic: Add properties 'skew-delay-{in,out}put-ps' Add the properties 'skew-delay-input-ps' and 'skew-delay-output-ps' to the generic parameters used for parsing DT files. This allows to specify the independent skew delay value for the two directions. This enables drivers that use the generic pin configuration to get the value passed through these new properties. 
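In a driver that relies on the generic pinconf parsing, the two values simply show up as extra entries in the configs array handed to its pin_config_set() callback, packed by parse_dt_cfg() from properties such as skew-delay-input-ps = <250>; in a pin configuration node. A hedged sketch of such a callback (pin count, storage layout and names are assumptions made up for illustration, not part of this patch):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/pinctrl/pinconf-generic.h>

#define SKETCH_NUM_PINS	8

static struct {
	u32 input_skew_ps;	/* clock skew on the input path, in picoseconds */
	u32 output_skew_ps;	/* latch delay on the output path, in picoseconds */
} sketch_skew[SKETCH_NUM_PINS];

static int sketch_pin_config_set(struct pinctrl_dev *pctldev, unsigned int pin,
				 unsigned long *configs, unsigned int num_configs)
{
	unsigned int i;

	if (pin >= SKETCH_NUM_PINS)
		return -EINVAL;

	for (i = 0; i < num_configs; i++) {
		u32 arg = pinconf_to_config_argument(configs[i]);

		switch (pinconf_to_config_param(configs[i])) {
		case PIN_CONFIG_SKEW_DELAY_INPUT_PS:
			sketch_skew[pin].input_skew_ps = arg;
			break;
		case PIN_CONFIG_SKEW_DELAY_OUTPUT_PS:
			sketch_skew[pin].output_skew_ps = arg;
			break;
		default:
			return -ENOTSUPP;
		}
	}

	return 0;
}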
Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 4 ++++ include/linux/pinctrl/pinconf-generic.h | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 72906d71ae1a..366775841c63 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -54,6 +54,8 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL, true), PCONFDUMP(PIN_CONFIG_SKEW_DELAY, "skew delay", NULL, true), + PCONFDUMP(PIN_CONFIG_SKEW_DELAY_INPUT_PS, "input skew delay", "ps", true), + PCONFDUMP(PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, "output skew delay", "ps", true), }; static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, @@ -198,6 +200,8 @@ static const struct pinconf_generic_params dt_params[] = { { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, { "skew-delay", PIN_CONFIG_SKEW_DELAY, 0 }, + { "skew-delay-input-ps", PIN_CONFIG_SKEW_DELAY_INPUT_PS, 0 }, + { "skew-delay-output-ps", PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, 0 }, }; /** diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index f82add5d3302..1be4032071c2 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -112,6 +112,12 @@ struct pinctrl_map; * or latch delay (on outputs) this parameter (in a custom format) * specifies the clock skew or latch delay. It typically controls how * many double inverters are put in front of the line. + * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the clock skew only. The argument is in ps. + * @PIN_CONFIG_SKEW_DELAY_OUPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to * this parameter (on a custom format) tells the driver which alternative @@ -147,6 +153,8 @@ enum pin_config_param { PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SKEW_DELAY, + PIN_CONFIG_SKEW_DELAY_INPUT_PS, + PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, PIN_CONFIG_END = 0x7F, -- cgit v1.2.3 From 7718f2a8b87af7363d60819ac0ac0da8b2f8ff00 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 23 Oct 2025 12:16:57 +0300 Subject: net/mlx5: Add software system image GUID infrastructure Replace direct hardware system image GUID usage with a new software system image GUID function that supports variable-length identifiers. Key changes: - Add mlx5_query_nic_sw_system_image_guid() function with length parameter. - Update all callsites to use the new function and buffer/length approach. - Modify mapping contexts to use byte arrays instead of u64 keys. - Update devcom matching to support variable-length keys. - Change mlx5_same_hw_devs() to use buffer comparison instead of u64. This refactoring prepares the infrastructure for balance ID support, which requires extending the system image GUID with additional data. 
The change maintains backward compatibility while enabling future enhancements. Signed-off-by: Mark Bloch Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761211020-925651-3-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 12 ++++++---- .../net/ethernet/mellanox/mlx5/core/en/devlink.c | 7 ++---- .../net/ethernet/mellanox/mlx5/core/en/mapping.c | 13 +++++++---- .../net/ethernet/mellanox/mlx5/core/en/mapping.h | 3 ++- .../ethernet/mellanox/mlx5/core/en/tc/int_port.c | 8 ++++--- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 11 +++++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 26 +++++++++++++--------- .../ethernet/mellanox/mlx5/core/esw/devlink_port.c | 6 +---- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++--- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++- .../net/ethernet/mellanox/mlx5/core/lib/devcom.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 15 +++++++++++++ include/linux/mlx5/driver.h | 3 +++ 14 files changed, 80 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 891bbbbfbbf1..64c04f52990f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -564,10 +564,14 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev) { - u64 fsystem_guid, psystem_guid; + u8 fsystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 psystem_guid[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 flen; + u8 plen; - fsystem_guid = mlx5_query_nic_system_image_guid(dev); - psystem_guid = mlx5_query_nic_system_image_guid(peer_dev); + mlx5_query_nic_sw_system_image_guid(dev, fsystem_guid, &flen); + mlx5_query_nic_sw_system_image_guid(peer_dev, psystem_guid, &plen); - return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid); + return plen && flen && flen == plen && + !memcmp(fsystem_guid, psystem_guid, flen); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index 0b1ac6e5c890..8818f65d1fbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -40,11 +40,8 @@ void mlx5e_destroy_devlink(struct mlx5e_dev *mlx5e_dev) static void mlx5e_devlink_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) { - u64 parent_id; - - parent_id = mlx5_query_nic_system_image_guid(dev); - ppid->id_len = sizeof(parent_id); - memcpy(ppid->id, &parent_id, sizeof(parent_id)); + BUILD_BUG_ON(MLX5_SW_IMAGE_GUID_MAX_BYTES > MAX_PHYS_ITEM_ID_LEN); + mlx5_query_nic_sw_system_image_guid(dev, ppid->id, &ppid->id_len); } int mlx5e_devlink_port_register(struct mlx5e_dev *mlx5e_dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c index 4e72ca8070e2..1de18c7e96ec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "mapping.h" @@ -24,7 +25,8 @@ struct mapping_ctx { struct delayed_work dwork; struct list_head pending_list; spinlock_t pending_list_lock; /* Guards pending list */ - u64 id; + u8 
id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 id_len; u8 type; struct list_head list; refcount_t refcount; @@ -220,13 +222,15 @@ mapping_create(size_t data_size, u32 max_id, bool delayed_removal) } struct mapping_ctx * -mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal) +mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id, + bool delayed_removal) { struct mapping_ctx *ctx; mutex_lock(&shared_ctx_lock); list_for_each_entry(ctx, &shared_ctx_list, list) { - if (ctx->id == id && ctx->type == type) { + if (ctx->type == type && ctx->id_len == id_len && + !memcmp(id, ctx->id, id_len)) { if (refcount_inc_not_zero(&ctx->refcount)) goto unlock; break; @@ -237,7 +241,8 @@ mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delaye if (IS_ERR(ctx)) goto unlock; - ctx->id = id; + memcpy(ctx->id, id, id_len); + ctx->id_len = id_len; ctx->type = type; list_add(&ctx->list, &shared_ctx_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h index 4e2119f0f4c1..e86a103d58b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h @@ -27,6 +27,7 @@ void mapping_destroy(struct mapping_ctx *ctx); /* adds mapping with an id or get an existing mapping with the same id */ struct mapping_ctx * -mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal); +mapping_create_for_id(u8 *id, u8 id_len, u8 type, size_t data_size, u32 max_id, + bool delayed_removal); #endif /* __MLX5_MAPPING_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c index 896f718483c3..991f47050643 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c @@ -307,7 +307,8 @@ mlx5e_tc_int_port_init(struct mlx5e_priv *priv) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5e_tc_int_port_priv *int_port_priv; - u64 mapping_id; + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + u8 id_len; if (!mlx5e_tc_int_port_supported(esw)) return NULL; @@ -316,9 +317,10 @@ mlx5e_tc_int_port_init(struct mlx5e_priv *priv) if (!int_port_priv) return NULL; - mapping_id = mlx5_query_nic_system_image_guid(priv->mdev); + mlx5_query_nic_sw_system_image_guid(priv->mdev, mapping_id, &id_len); - int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_INT_PORT, + int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_INT_PORT, sizeof(u32) * 2, (1 << ESW_VPORT_BITS) - 1, true); if (IS_ERR(int_port_priv->metadata_mapping)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 870d12364f99..fc0e57403d25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -2287,9 +2287,10 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, enum mlx5_flow_namespace_type ns_type, struct mlx5e_post_act *post_act) { + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mlx5_tc_ct_priv *ct_priv; struct mlx5_core_dev *dev; - u64 mapping_id; + u8 id_len; int err; dev = priv->mdev; @@ -2301,16 +2302,18 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, if (!ct_priv) goto err_alloc; - mapping_id = mlx5_query_nic_system_image_guid(dev); + 
mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len); - ct_priv->zone_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_ZONE, + ct_priv->zone_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_ZONE, sizeof(u16), 0, true); if (IS_ERR(ct_priv->zone_mapping)) { err = PTR_ERR(ct_priv->zone_mapping); goto err_mapping_zone; } - ct_priv->labels_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_LABELS, + ct_priv->labels_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_LABELS, sizeof(u32) * 4, 0, true); if (IS_ERR(ct_priv->labels_mapping)) { err = PTR_ERR(ct_priv->labels_mapping); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 54ccfb4e6c4e..a8773b2342c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5233,10 +5233,11 @@ static void mlx5e_tc_nic_destroy_miss_table(struct mlx5e_priv *priv) int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { struct mlx5e_tc_table *tc = mlx5e_fs_get_tc(priv->fs); + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mlx5_core_dev *dev = priv->mdev; struct mapping_ctx *chains_mapping; struct mlx5_chains_attr attr = {}; - u64 mapping_id; + u8 id_len; int err; mlx5e_mod_hdr_tbl_init(&tc->mod_hdr); @@ -5252,11 +5253,13 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv) lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key); lockdep_init_map(&tc->ht.run_work.lockdep_map, "tc_ht_wq_key", &tc_ht_wq_key, 0); - mapping_id = mlx5_query_nic_system_image_guid(dev); + mlx5_query_nic_sw_system_image_guid(dev, mapping_id, &id_len); - chains_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + chains_mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_CHAIN, sizeof(struct mlx5_mapped_obj), - MLX5E_TC_TABLE_CHAIN_TAG_MASK, true); + MLX5E_TC_TABLE_CHAIN_TAG_MASK, + true); if (IS_ERR(chains_mapping)) { err = PTR_ERR(chains_mapping); @@ -5387,14 +5390,15 @@ void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht) int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) { const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts); + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mlx5_devcom_match_attr attr = {}; struct netdev_phys_item_id ppid; struct mlx5e_rep_priv *rpriv; struct mapping_ctx *mapping; struct mlx5_eswitch *esw; struct mlx5e_priv *priv; - u64 mapping_id; int err = 0; + u8 id_len; rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); priv = netdev_priv(rpriv->netdev); @@ -5412,9 +5416,9 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) uplink_priv->tc_psample = mlx5e_tc_sample_init(esw, uplink_priv->post_act); - mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len); - mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL, + mapping = mapping_create_for_id(mapping_id, id_len, MAPPING_TYPE_TUNNEL, sizeof(struct tunnel_match_key), TUNNEL_INFO_BITS_MASK, true); @@ -5427,8 +5431,10 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) /* Two last values are reserved for stack devices slow path table mark * and bridge ingress push mark. 
*/ - mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL_ENC_OPTS, - sz_enc_opts, ENC_OPTS_BITS_MASK - 2, true); + mapping = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_TUNNEL_ENC_OPTS, + sz_enc_opts, ENC_OPTS_BITS_MASK - 2, + true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto err_enc_opts_mapping; @@ -5449,7 +5455,7 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) err = netif_get_port_parent_id(priv->netdev, &ppid, false); if (!err) { - memcpy(&attr.key.val, &ppid.id, sizeof(attr.key.val)); + memcpy(&attr.key.buf, &ppid.id, ppid.id_len); attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS; attr.net = mlx5_core_net(esw->dev); mlx5_esw_offloads_devcom_init(esw, &attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index cf88a106d80d..89a58dee50b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -7,11 +7,7 @@ static void mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) { - u64 parent_id; - - parent_id = mlx5_query_nic_system_image_guid(dev); - ppid->id_len = sizeof(parent_id); - memcpy(ppid->id, &parent_id, sizeof(parent_id)); + mlx5_query_nic_sw_system_image_guid(dev, ppid->id, &ppid->id_len); } static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 34749814f19b..9735a75732cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3557,10 +3557,11 @@ bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 cont int esw_offloads_enable(struct mlx5_eswitch *esw) { + u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; struct mapping_ctx *reg_c0_obj_pool; struct mlx5_vport *vport; unsigned long i; - u64 mapping_id; + u8 id_len; int err; mutex_init(&esw->offloads.termtbl_mutex); @@ -3582,9 +3583,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_vport_metadata; - mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + mlx5_query_nic_sw_system_image_guid(esw->dev, mapping_id, &id_len); - reg_c0_obj_pool = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + reg_c0_obj_pool = mapping_create_for_id(mapping_id, id_len, + MAPPING_TYPE_CHAIN, sizeof(struct mlx5_mapped_obj), ESW_REG_C0_USER_DATA_METADATA_MASK, true); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 3db0387bf6dc..1ac933cd8f02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1418,10 +1418,12 @@ static void mlx5_lag_unregister_hca_devcom_comp(struct mlx5_core_dev *dev) static int mlx5_lag_register_hca_devcom_comp(struct mlx5_core_dev *dev) { struct mlx5_devcom_match_attr attr = { - .key.val = mlx5_query_nic_system_image_guid(dev), .flags = MLX5_DEVCOM_MATCH_FLAGS_NS, .net = mlx5_core_net(dev), }; + u8 len __always_unused; + + mlx5_query_nic_sw_system_image_guid(dev, attr.key.buf, &len); /* This component is use to sync adding core_dev to lag_dev and to sync * changes of mlx5_adev_devices between LAG layer and other layers. 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h index 609c85f47917..91e5ae529d5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h @@ -10,8 +10,10 @@ enum mlx5_devom_match_flags { MLX5_DEVCOM_MATCH_FLAGS_NS = BIT(0), }; +#define MLX5_DEVCOM_MATCH_KEY_MAX 32 union mlx5_devcom_match_key { u64 val; + u8 buf[MLX5_DEVCOM_MATCH_KEY_MAX]; }; struct mlx5_devcom_match_attr { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 082259b56816..acef7d0ffa09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -444,6 +444,8 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev); void mlx5_uninit_one_light(struct mlx5_core_dev *dev); void mlx5_unload_one_light(struct mlx5_core_dev *dev); +void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf, + u8 *len); int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport, u16 opmod); #define mlx5_vport_get_other_func_general_cap(dev, vport, out) \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 2ed2e530b07d..4224e2750865 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1190,6 +1190,21 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); +void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf, + u8 *len) +{ + u64 fw_system_image_guid; + + *len = 0; + + fw_system_image_guid = mlx5_query_nic_system_image_guid(mdev); + if (!fw_system_image_guid) + return; + + memcpy(buf, &fw_system_image_guid, sizeof(fw_system_image_guid)); + *len += sizeof(fw_system_image_guid); +} + static bool mlx5_vport_use_vhca_id_as_func_id(struct mlx5_core_dev *dev, u16 vport_num, u16 *vhca_id) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5405ca1038f9..dcf262aa9ea6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1379,4 +1379,7 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) { return devlink_net(priv_to_devlink(dev)); } + +#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8 + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 20d78ead947783b039b02ca4b8c551b4d1894759 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 23 Oct 2025 12:17:00 +0300 Subject: net/mlx5: Add balance ID support for LAG multiplane groups Implement balance ID support for multiplane LAG configurations. This feature enables per-multiplane group load balancing by extending the software system image GUID with a balance ID component. Key implementations: - Enable lag_per_mp_group capability when supported by hardware. - Append load_balance_id to software system image GUID when conditions are met. - Increase MLX5_SW_IMAGE_GUID_MAX_BYTES from 8 to 9 to accommodate the extra byte. The balance ID is appended to the system image GUID only when both load_balance_id and lag_per_mp_group capabilities are available, ensuring backward compatibility while enabling enhanced LAG functionality. This enhancement allows for more granular load balancing control in complex multi-plane LAG deployments, improving network performance and flexibility. 
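To make the resulting key format concrete, the following standalone sketch (plain userspace C, not driver code) shows the assumed layout after this change: the 8-byte firmware system image GUID, optionally followed by a single balance-ID byte, with devices matched on both length and contents.

```c
#include <stdint.h>
#include <string.h>

#define SW_IMAGE_GUID_MAX_BYTES 9	/* mirrors MLX5_SW_IMAGE_GUID_MAX_BYTES */

/* Assemble the variable-length key: GUID first, balance ID appended only
 * when both load_balance_id and lag_per_mp_group are supported.
 */
static uint8_t build_sw_guid(uint8_t buf[SW_IMAGE_GUID_MAX_BYTES],
			     uint64_t fw_guid, int has_balance_id,
			     uint8_t balance_id)
{
	uint8_t len = 0;

	memcpy(buf, &fw_guid, sizeof(fw_guid));
	len += sizeof(fw_guid);

	if (has_balance_id)
		buf[len++] = balance_id;

	return len;
}

/* Two devices are considered the same hardware only when both length and
 * contents of their keys are equal.
 */
static int same_hw(const uint8_t *a, uint8_t alen,
		   const uint8_t *b, uint8_t blen)
{
	return alen && alen == blen && !memcmp(a, b, alen);
}
```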
Signed-off-by: Mark Bloch Reviewed-by: Moshe Shemesh Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761211020-925651-6-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 4 ++++ include/linux/mlx5/driver.h | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 563267acf386..c904696cbc3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -575,6 +575,11 @@ static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx) do_set = true; } + if (MLX5_CAP_GEN_2_MAX(dev, lag_per_mp_group)) { + MLX5_SET(cmd_hca_cap_2, set_hca_cap, lag_per_mp_group, 1); + do_set = true; + } + /* some FW versions that support querying MLX5_CAP_GENERAL_2 * capabilities but don't support setting them. * Skip unnecessary update to hca_cap_2 when no changes were introduced diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 4224e2750865..992873536c1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1203,6 +1203,10 @@ void mlx5_query_nic_sw_system_image_guid(struct mlx5_core_dev *mdev, u8 *buf, memcpy(buf, &fw_system_image_guid, sizeof(fw_system_image_guid)); *len += sizeof(fw_system_image_guid); + + if (MLX5_CAP_GEN_2(mdev, load_balance_id) && + MLX5_CAP_GEN_2(mdev, lag_per_mp_group)) + buf[(*len)++] = MLX5_CAP_GEN_2(mdev, load_balance_id); } static bool mlx5_vport_use_vhca_id_as_func_id(struct mlx5_core_dev *dev, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dcf262aa9ea6..046396269ccf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1380,6 +1380,6 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) return devlink_net(priv_to_devlink(dev)); } -#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8 +#define MLX5_SW_IMAGE_GUID_MAX_BYTES 9 #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 12a1c9353c47c0fb3464eba2d78cdf649dee1cf7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 Oct 2025 09:27:32 +0900 Subject: block: fix op_is_zone_mgmt() to handle REQ_OP_ZONE_RESET_ALL REQ_OP_ZONE_RESET_ALL is a zone management request. Fix op_is_zone_mgmt() to return true for that operation, like it already does for REQ_OP_ZONE_RESET. While no problems were reported without this fix, this change allows strengthening checks in various block device drivers (scsi sd, virtioblk, DM) where op_is_zone_mgmt() is used to verify that a zone management command is not being issued to a regular block device. 
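A hypothetical driver-side check of the kind this fix strengthens is sketched below; the helper name and the dev_is_zoned flag are invented, while op_is_zone_mgmt() and enum req_op are the real interfaces touched by the patch.

```c
#include <linux/blk_types.h>

/*
 * Reject zone management commands issued to a device that is not zoned.
 * With this fix, REQ_OP_ZONE_RESET_ALL is now classified as zone
 * management and is caught here as well.
 */
static bool demo_is_invalid_zone_cmd(enum req_op op, bool dev_is_zoned)
{
	return op_is_zone_mgmt(op) && !dev_is_zoned;
}
```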
Fixes: 6c1b1da58f8c ("block: add zone open, close and finish operations") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8e8d1cc8b06c..d8ba743a89b7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -478,6 +478,7 @@ static inline bool op_is_zone_mgmt(enum req_op op) { switch (op & REQ_OP_MASK) { case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: -- cgit v1.2.3 From 19de03b312d69a7e9bacb51c806c6e3f4207376c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 Oct 2025 09:27:33 +0900 Subject: block: make REQ_OP_ZONE_OPEN a write operation A REQ_OP_OPEN_ZONE request changes the condition of a sequential zone of a zoned block device to the explicitly open condition (BLK_ZONE_COND_EXP_OPEN). As such, it should be considered a write operation. Change this operation code to be an odd number to reflect this. The following operation numbers are changed to keep the numbering compact. No problems were reported without this change as this operation has no data. However, this unifies the zone operation to reflect that they modify the device state and also allows strengthening checks in the block layer, e.g. checking if this operation is not issued against a read-only device. Fixes: 6c1b1da58f8c ("block: add zone open, close and finish operations") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d8ba743a89b7..44c30183ecc3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -341,15 +341,15 @@ enum req_op { /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /* Open a zone */ - REQ_OP_ZONE_OPEN = (__force blk_opf_t)10, + REQ_OP_ZONE_OPEN = (__force blk_opf_t)11, /* Close a zone */ - REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11, + REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13, /* Transition a zone to full */ - REQ_OP_ZONE_FINISH = (__force blk_opf_t)13, + REQ_OP_ZONE_FINISH = (__force blk_opf_t)15, /* reset a zone write pointer */ - REQ_OP_ZONE_RESET = (__force blk_opf_t)15, + REQ_OP_ZONE_RESET = (__force blk_opf_t)17, /* reset all the zone present on the device */ - REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17, + REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, /* Driver private requests */ REQ_OP_DRV_IN = (__force blk_opf_t)34, -- cgit v1.2.3 From baeb66fbd4201d1c4325074e78b1f557dff89b5b Mon Sep 17 00:00:00 2001 From: Jimmy Hu Date: Thu, 23 Oct 2025 05:49:45 +0000 Subject: usb: gadget: udc: fix use-after-free in usb_gadget_state_work A race condition during gadget teardown can lead to a use-after-free in usb_gadget_state_work(), as reported by KASAN: BUG: KASAN: invalid-access in sysfs_notify+0x2c/0xd0 Workqueue: events usb_gadget_state_work The fundamental race occurs because a concurrent event (e.g., an interrupt) can call usb_gadget_set_state() and schedule gadget->work at any time during the cleanup process in usb_del_gadget(). 
Commit 399a45e5237c ("usb: gadget: core: flush gadget workqueue after device removal") attempted to fix this by moving flush_work() to after device_del(). However, this does not fully solve the race, as a new work item can still be scheduled *after* flush_work() completes but before the gadget's memory is freed, leading to the same use-after-free. This patch fixes the race condition robustly by introducing a 'teardown' flag and a 'state_lock' spinlock to the usb_gadget struct. The flag is set during cleanup in usb_del_gadget() *before* calling flush_work() to prevent any new work from being scheduled once cleanup has commenced. The scheduling site, usb_gadget_set_state(), now checks this flag under the lock before queueing the work, thus safely closing the race window. Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue") Cc: stable Signed-off-by: Jimmy Hu Link: https://patch.msgid.link/20251023054945.233861-1-hhhuuu@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 17 ++++++++++++++++- include/linux/usb/gadget.h | 5 +++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 694653761c44..8dbe79bdc0f9 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work) void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { + unsigned long flags; + + spin_lock_irqsave(&gadget->state_lock, flags); gadget->state = state; - schedule_work(&gadget->work); + if (!gadget->teardown) + schedule_work(&gadget->work); + spin_unlock_irqrestore(&gadget->state_lock, flags); trace_usb_gadget_set_state(gadget, 0); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); @@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev) void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { + spin_lock_init(&gadget->state_lock); + gadget->teardown = false; INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; @@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc); void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; + unsigned long flags; if (!udc) return; @@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget) kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); device_del(&gadget->dev); + /* + * Set the teardown flag before flushing the work to prevent new work + * from being scheduled while we are cleaning up. + */ + spin_lock_irqsave(&gadget->state_lock, flags); + gadget->teardown = true; + spin_unlock_irqrestore(&gadget->state_lock, flags); flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 3aaf19e77558..8285b19a25e0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -376,6 +376,9 @@ struct usb_gadget_ops { * can handle. The UDC must support this and all slower speeds and lower * number of lanes. * @state: the state we are now (attached, suspended, configured, etc) + * @state_lock: Spinlock protecting the `state` and `teardown` members. + * @teardown: True if the device is undergoing teardown, used to prevent + * new work from being scheduled during cleanup. 
* @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. @@ -451,6 +454,8 @@ struct usb_gadget { enum usb_ssp_rate max_ssp_rate; enum usb_device_state state; + spinlock_t state_lock; + bool teardown; const char *name; struct device dev; unsigned isoch_delay; -- cgit v1.2.3 From f0f7a3f542c1698edb69075f25a3f846207facba Mon Sep 17 00:00:00 2001 From: Qiu Wenbo Date: Tue, 28 Oct 2025 14:30:09 +0800 Subject: platform/x86: int3472: Fix double free of GPIO device during unregister MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit regulator_unregister() already frees the associated GPIO device. On ThinkPad X9 (Lunar Lake), this causes a double free issue that leads to random failures when other drivers (typically Intel THC) attempt to allocate interrupts. The root cause is that the reference count of the pinctrl_intel_platform module unexpectedly drops to zero when this driver defers its probe. This behavior can also be reproduced by unloading the module directly. Fix the issue by removing the redundant release of the GPIO device during regulator unregistration. Cc: stable@vger.kernel.org Fixes: 1e5d088a52c2 ("platform/x86: int3472: Stop using devm_gpiod_get()") Signed-off-by: Qiu Wenbo Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Reviewed-by: Hans de Goede Reviewed-by: Daniel Scally Link: https://patch.msgid.link/20251028063009.289414-1-qiuwenbo@gnome.org Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/int3472/clk_and_regulator.c | 5 +---- include/linux/platform_data/x86/int3472.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/intel/int3472/clk_and_regulator.c b/drivers/platform/x86/intel/int3472/clk_and_regulator.c index 476ec24d3702..9e052b164a1a 100644 --- a/drivers/platform/x86/intel/int3472/clk_and_regulator.c +++ b/drivers/platform/x86/intel/int3472/clk_and_regulator.c @@ -245,15 +245,12 @@ int skl_int3472_register_regulator(struct int3472_discrete_device *int3472, if (IS_ERR(regulator->rdev)) return PTR_ERR(regulator->rdev); - int3472->regulators[int3472->n_regulator_gpios].ena_gpio = gpio; int3472->n_regulator_gpios++; return 0; } void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472) { - for (int i = 0; i < int3472->n_regulator_gpios; i++) { + for (int i = 0; i < int3472->n_regulator_gpios; i++) regulator_unregister(int3472->regulators[i].rdev); - gpiod_put(int3472->regulators[i].ena_gpio); - } } diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 1571e9157fa5..b1b837583d54 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -100,7 +100,6 @@ struct int3472_gpio_regulator { struct regulator_consumer_supply supply_map[GPIO_REGULATOR_SUPPLY_MAP_COUNT * 2]; char supply_name_upper[GPIO_SUPPLY_NAME_LENGTH]; char regulator_name[GPIO_REGULATOR_NAME_LENGTH]; - struct gpio_desc *ena_gpio; struct regulator_dev *rdev; struct regulator_desc rdesc; }; -- cgit v1.2.3 From 48cbf50531d8eca15b8a811717afdebb8677de9b Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 24 Oct 2025 16:23:44 +0800 Subject: regmap: irq: Correct documentation of wake_invert flag Per commit 9442490a0286 ("regmap: irq: Support wake IRQ mask inversion") the wake_invert flag is to support enable register, so cleared bits are wake disabled. 
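A minimal, hypothetical regmap-irq chip description matching the corrected semantics might look as follows; register offsets and masks are invented, the point being that a wake register which acts as an enable register (set bits enable wake, cleared bits are wake disabled) sets .wake_invert.

```c
#include <linux/bits.h>
#include <linux/kernel.h>
#include <linux/regmap.h>

static const struct regmap_irq demo_irqs[] = {
	REGMAP_IRQ_REG(0, 0, BIT(0)),
	REGMAP_IRQ_REG(1, 0, BIT(1)),
};

static const struct regmap_irq_chip demo_irq_chip = {
	.name		= "demo",
	.status_base	= 0x10,
	.mask_base	= 0x11,
	.wake_base	= 0x12,
	.wake_invert	= true,	/* wake register is an enable register */
	.num_regs	= 1,
	.irqs		= demo_irqs,
	.num_irqs	= ARRAY_SIZE(demo_irqs),
};
```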
Fixes: 68622bdfefb9 ("regmap: irq: document mask/wake_invert flags") Cc: stable@vger.kernel.org Signed-off-by: Shawn Guo Link: https://patch.msgid.link/20251024082344.2188895-1-shawnguo2@yeah.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4e1ac1fbcec4..55343795644b 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1643,7 +1643,7 @@ struct regmap_irq_chip_data; * @status_invert: Inverted status register: cleared bits are active interrupts. * @status_is_level: Status register is actuall signal level: Xor status * register with previous value to get active interrupts. - * @wake_invert: Inverted wake register: cleared bits are wake enabled. + * @wake_invert: Inverted wake register: cleared bits are wake disabled. * @type_in_mask: Use the mask registers for controlling irq type. Use this if * the hardware provides separate bits for rising/falling edge * or low/high level interrupts and they should be combined into -- cgit v1.2.3 From d1e6d2773898c7a1c19e12619d303920d32a9cd0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 10 Oct 2025 17:38:13 +0200 Subject: rcu: Add a small-width RCU watching counter debug option A later commit will reduce the size of the RCU watching counter to free up some bits for another purpose. Paul suggested adding a config option to test the extreme case where the counter is reduced to its minimum usable width for rcutorture to poke at, so do that. Make it only configurable under RCU_EXPERT. While at it, add a comment to explain the layout of context_tracking->state. Link: http://lore.kernel.org/r/4c2cb573-168f-4806-b1d9-164e8276e66a@paulmck-laptop Suggested-by: Paul E. McKenney Signed-off-by: Valentin Schneider Reviewed-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker --- include/linux/context_tracking_state.h | 44 ++++++++++++++++++++++++++++------ kernel/rcu/Kconfig.debug | 15 ++++++++++++ 2 files changed, 52 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 7b8433d5a8ef..0b81248aa03e 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -18,12 +18,6 @@ enum ctx_state { CT_STATE_MAX = 4, }; -/* Odd value for watching, else even. */ -#define CT_RCU_WATCHING CT_STATE_MAX - -#define CT_STATE_MASK (CT_STATE_MAX - 1) -#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) - struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER /* @@ -44,9 +38,45 @@ struct context_tracking { #endif }; +/* + * We cram two different things within the same atomic variable: + * + * CT_RCU_WATCHING_START CT_STATE_START + * | | + * v v + * MSB [ RCU watching counter ][ context_state ] LSB + * ^ ^ + * | | + * CT_RCU_WATCHING_END CT_STATE_END + * + * Bits are used from the LSB upwards, so unused bits (if any) will always be in + * upper bits of the variable. + */ #ifdef CONFIG_CONTEXT_TRACKING +#define CT_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE) + +#define CT_STATE_WIDTH bits_per(CT_STATE_MAX - 1) +#define CT_STATE_START 0 +#define CT_STATE_END (CT_STATE_START + CT_STATE_WIDTH - 1) + +#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH) +#define CT_RCU_WATCHING_WIDTH (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 
2 : CT_RCU_WATCHING_MAX_WIDTH) +#define CT_RCU_WATCHING_START (CT_STATE_END + 1) +#define CT_RCU_WATCHING_END (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1) +#define CT_RCU_WATCHING BIT(CT_RCU_WATCHING_START) + +#define CT_STATE_MASK GENMASK(CT_STATE_END, CT_STATE_START) +#define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START) + +#define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH) + +static_assert(CT_STATE_WIDTH + + CT_RCU_WATCHING_WIDTH + + CT_UNUSED_WIDTH == + CT_SIZE); + DECLARE_PER_CPU(struct context_tracking, context_tracking); -#endif +#endif /* CONFIG_CONTEXT_TRACKING */ #ifdef CONFIG_CONTEXT_TRACKING_USER static __always_inline int __ct_state(void) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 12e4c64ebae1..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD when looking for certain types of RCU usage bugs, for example, too-short RCU read-side critical sections. + +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. + endmenu # "RCU Debugging" -- cgit v1.2.3 From bcce8c74f1ce1e2731ac0261287897e3768767d8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Oct 2025 15:46:21 -0700 Subject: PCI: Enable host bridge emulation for PCI_DOMAINS_GENERIC platforms The ability to emulate a host bridge is useful not only for hardware PCI controllers like CONFIG_VMD, or virtual PCI controllers like CONFIG_PCI_HYPERV, but also for test and development scenarios like CONFIG_SAMPLES_DEVSEC [1]. One stumbling block for defining CONFIG_SAMPLES_DEVSEC, a sample implementation of a platform TSM for PCI Device Security, is the need to accommodate PCI_DOMAINS_GENERIC architectures alongside x86 [2]. In support of supplementing the existing CONFIG_PCI_BRIDGE_EMUL infrastructure for host bridges: * Introduce pci_bus_find_emul_domain_nr() as a common way to find a free PCI domain number whether that is to reuse the existing dynamic allocation code in the !ACPI case, or to assign an unused domain above the last ACPI segment. * Convert pci-hyperv to the new allocator so that the PCI core can unconditionally assume that bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET is the dynamically allocated case. A follow on patch can also convert vmd to the new scheme. Currently vmd is limited to CONFIG_PCI_DOMAINS_GENERIC=n (x86) so, unlike pci-hyperv, it does not immediately conflict with this new pci_bus_find_emul_domain_nr() mechanism. Link: http://lore.kernel.org/174107249038.1288555.12362100502109498455.stgit@dwillia2-xfh.jf.intel.com [1] Reported-by: Suzuki K Poulose Closes: http://lore.kernel.org/20250311144601.145736-3-suzuki.poulose@arm.com [2] Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Suzuki K Poulose Tested-by: Michael Kelley Reviewed-by: Michael Kelley Cc: Lorenzo Pieralisi Cc: Manivannan Sadhasivam Cc: Rob Herring Cc: K. Y. 
Srinivasan Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Link: https://patch.msgid.link/20251024224622.1470555-2-dan.j.williams@intel.com --- drivers/pci/controller/pci-hyperv.c | 62 ++++++------------------------------- drivers/pci/pci.c | 24 +++++++++++++- drivers/pci/probe.c | 8 ++++- include/linux/pci.h | 7 +++++ 4 files changed, 46 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 146b43981b27..1e237d3538f9 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev) return 0; } -#define HVPCI_DOM_MAP_SIZE (64 * 1024) -static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); - -/* - * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 - * as invalid for passthrough PCI devices of this driver. - */ -#define HVPCI_DOM_INVALID 0 - -/** - * hv_get_dom_num() - Get a valid PCI domain number - * Check if the PCI domain number is in use, and return another number if - * it is in use. - * - * @dom: Requested domain number - * - * return: domain number on success, HVPCI_DOM_INVALID on failure - */ -static u16 hv_get_dom_num(u16 dom) -{ - unsigned int i; - - if (test_and_set_bit(dom, hvpci_dom_map) == 0) - return dom; - - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { - if (test_and_set_bit(i, hvpci_dom_map) == 0) - return i; - } - - return HVPCI_DOM_INVALID; -} - -/** - * hv_put_dom_num() - Mark the PCI domain number as free - * @dom: Domain number to be freed - */ -static void hv_put_dom_num(u16 dom) -{ - clear_bit(dom, hvpci_dom_map); -} - /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev, { struct pci_host_bridge *bridge; struct hv_pcibus_device *hbus; - u16 dom_req, dom; + int ret, dom; + u16 dom_req; char *name; - int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); if (!bridge) @@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev, * PCI bus (which is actually emulated by the hypervisor) is domain 0. * (2) There will be no overlap between domains (after fixing possible * collisions) in the same VM. + * + * Because Gen1 VMs use domain 0, don't allow picking domain 0 here, + * even if bytes 4 and 5 of the instance GUID are both zero. For wider + * userspace compatibility, limit the domain ID to a 16-bit value. 
*/ dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; - dom = hv_get_dom_num(dom_req); - - if (dom == HVPCI_DOM_INVALID) { + dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX); + if (dom < 0) { dev_err(&hdev->device, "Unable to use dom# 0x%x or other numbers", dom_req); ret = -EINVAL; @@ -3917,7 +3878,7 @@ close: destroy_wq: destroy_workqueue(hbus->wq); free_dom: - hv_put_dom_num(hbus->bridge->domain_nr); + pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr); free_bus: kfree(hbus); return ret; @@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev) irq_domain_remove(hbus->irq_domain); irq_domain_free_fwnode(hbus->fwnode); - hv_put_dom_num(hbus->bridge->domain_nr); - kfree(hbus); } @@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void) if (ret) return ret; - /* Set the invalid domain number's bit, so it will not be used */ - set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); - /* Initialize PCI block r/w interface */ hvpci_block_ops.read_block = hv_read_config_block; hvpci_block_ops.write_block = hv_write_config_block; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b14dd064006c..f4ccb948e59d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6656,9 +6656,31 @@ static void pci_no_domains(void) #endif } +#ifdef CONFIG_PCI_DOMAINS +static DEFINE_IDA(pci_domain_nr_dynamic_ida); + +/** + * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints + * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable + * @min: minimum allowable domain + * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned + */ +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr); + +void pci_bus_release_emul_domain_nr(int domain_nr) +{ + ida_free(&pci_domain_nr_dynamic_ida, domain_nr); +} +EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr); +#endif + #ifdef CONFIG_PCI_DOMAINS_GENERIC static DEFINE_IDA(pci_domain_nr_static_ida); -static DEFINE_IDA(pci_domain_nr_dynamic_ida); static void of_pci_reserve_static_domain_nr(void) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c83e75a0ec12..c3f6e2714440 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -657,6 +657,11 @@ static void pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); + + /* Host bridges only have domain_nr set in the emulation case */ + if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_emul_domain_nr(bridge->domain_nr); + kfree(bridge); } @@ -1137,7 +1142,8 @@ unregister: device_del(&bridge->dev); free: #ifdef CONFIG_PCI_DOMAINS_GENERIC - pci_bus_release_domain_nr(parent, bus->domain_nr); + if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_domain_nr(parent, bus->domain_nr); #endif if (bus_registered) put_device(&bus->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..1ef1535802b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1956,10 +1956,17 @@ DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) */ #ifdef CONFIG_PCI_DOMAINS extern int pci_domains_supported; +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max); +void pci_bus_release_emul_domain_nr(int domain_nr); #else enum { pci_domains_supported = 0 }; static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static 
inline int pci_proc_domain(struct pci_bus *bus) { return 0; } +static inline int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return 0; +} +static inline void pci_bus_release_emul_domain_nr(int domain_nr) { } #endif /* CONFIG_PCI_DOMAINS */ /* -- cgit v1.2.3 From a1f3058930745d2b938b6b4f5bd9630dc74b26b7 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Fri, 10 Oct 2025 16:16:59 +0800 Subject: fbcon: Set fb_display[i]->mode to NULL when the mode is released Recently, we discovered the following issue through syzkaller: BUG: KASAN: slab-use-after-free in fb_mode_is_equal+0x285/0x2f0 Read of size 4 at addr ff11000001b3c69c by task syz.xxx ... Call Trace: dump_stack_lvl+0xab/0xe0 print_address_description.constprop.0+0x2c/0x390 print_report+0xb9/0x280 kasan_report+0xb8/0xf0 fb_mode_is_equal+0x285/0x2f0 fbcon_mode_deleted+0x129/0x180 fb_set_var+0xe7f/0x11d0 do_fb_ioctl+0x6a0/0x750 fb_ioctl+0xe0/0x140 __x64_sys_ioctl+0x193/0x210 do_syscall_64+0x5f/0x9c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Based on experimentation and analysis, during framebuffer unregistration, only the memory of fb_info->modelist is freed, without setting the corresponding fb_display[i]->mode to NULL for the freed modes. This leads to UAF issues during subsequent accesses. Here's an example of reproduction steps: 1. With /dev/fb0 already registered in the system, load a kernel module to register a new device /dev/fb1; 2. Set fb1's mode to the global fb_display[] array (via FBIOPUT_CON2FBMAP); 3. Switch console from fb to VGA (to allow normal rmmod of the ko); 4. Unload the kernel module, at this point fb1's modelist is freed, leaving a wild pointer in fb_display[]; 5. Trigger the bug via system calls through fb0 attempting to delete a mode from fb0. Add a check in do_unregister_framebuffer(): if the mode to be freed exists in fb_display[], set the corresponding mode pointer to NULL. 
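Reproduction step 2 above relies on the FBIOPUT_CON2FBMAP ioctl; a minimal userspace sketch of that step is shown below. The console number and framebuffer index are examples only, and error handling is reduced to the bare minimum.

```c
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fb.h>

int main(void)
{
	struct fb_con2fbmap map = {
		.console = 1,		/* VT to remap */
		.framebuffer = 1,	/* point it at /dev/fb1 */
	};
	int fd = open("/dev/fb0", O_RDWR);

	if (fd < 0)
		return 1;

	ioctl(fd, FBIOPUT_CON2FBMAP, &map);
	close(fd);
	return 0;
}
```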
Signed-off-by: Quanmin Yan Reviewed-by: Thomas Zimmermann Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- drivers/video/fbdev/core/fbcon.c | 19 +++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 1 + include/linux/fbcon.h | 2 ++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 96cc9b389246..9bd3c3814b5c 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2810,6 +2810,25 @@ int fbcon_mode_deleted(struct fb_info *info, return found; } +static void fbcon_delete_mode(struct fb_videomode *m) +{ + struct fbcon_display *p; + + for (int i = first_fb_vc; i <= last_fb_vc; i++) { + p = &fb_display[i]; + if (p->mode == m) + p->mode = NULL; + } +} + +void fbcon_delete_modelist(struct list_head *head) +{ + struct fb_modelist *modelist; + + list_for_each_entry(modelist, head, list) + fbcon_delete_mode(&modelist->mode); +} + #ifdef CONFIG_VT_HW_CONSOLE_BINDING static void fbcon_unbind(void) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 53f1719b1ae1..eff757ebbed1 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -544,6 +544,7 @@ static void do_unregister_framebuffer(struct fb_info *fb_info) fb_info->pixmap.addr = NULL; } + fbcon_delete_modelist(&fb_info->modelist); fb_destroy_modelist(&fb_info->modelist); registered_fb[fb_info->node] = NULL; num_registered_fb--; diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index 81f0e698acbf..f206370060e1 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -18,6 +18,7 @@ void fbcon_suspended(struct fb_info *info); void fbcon_resumed(struct fb_info *info); int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode); +void fbcon_delete_modelist(struct list_head *head); void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); @@ -38,6 +39,7 @@ static inline void fbcon_suspended(struct fb_info *info) {} static inline void fbcon_resumed(struct fb_info *info) {} static inline int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { return 0; } +static inline void fbcon_delete_modelist(struct list_head *head) {} static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} -- cgit v1.2.3 From 32e0f607ac6a2bb5d144540897535fd01be77586 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:24 -0400 Subject: tracing: Add trace_seq_pop() and seq_buf_pop() In order to allow an interface to remove an added character from the trace_seq and seq_buf descriptors, add helper functions trace_seq_pop() and seq_buf_pop(). 
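A usage sketch for the new helper: a hypothetical caller prints a comma-separated list into a trace_seq and pops the trailing separator. Only trace_seq_printf() and the new trace_seq_pop() are existing interfaces; the function itself is invented for illustration.

```c
#include <linux/bitops.h>
#include <linux/trace_seq.h>

static void demo_print_flags(struct trace_seq *s, unsigned long flags)
{
	int bit;

	for_each_set_bit(bit, &flags, BITS_PER_LONG)
		trace_seq_printf(s, "%d,", bit);

	/* Remove the final ',' if anything was printed. */
	if (flags)
		trace_seq_pop(s);
}
```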
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231148.594898736@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 17 +++++++++++++++++ include/linux/trace_seq.h | 13 +++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..9f2839e73f8a 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num) } } +/** + * seq_buf_pop - pop off the last written character + * @s: the seq_buf handle + * + * Removes the last written character to the seq_buf @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int seq_buf_pop(struct seq_buf *s) +{ + if (!s->len) + return -1; + + s->len--; + return (unsigned int)s->buffer[s->len]; +} + extern __printf(2, 3) int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); extern __printf(2, 0) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 557780fe1c77..4a0b8c172d27 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s) return s->full || seq_buf_has_overflowed(&s->seq); } +/** + * trace_seq_pop - pop off the last written character + * @s: trace sequence descriptor + * + * Removes the last written character to the trace_seq @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int trace_seq_pop(struct trace_seq *s) +{ + return seq_buf_pop(&s->seq); +} + /* * Currently only defined when tracing is enabled. */ -- cgit v1.2.3 From c72568c21b97dbc48d02b769f4eec6667ad13d5a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Oct 2025 09:12:40 +0000 Subject: net: rps: softnet_data reorg to make enqueue_to_backlog() fast enqueue_to_backlog() is showing up in kernel profiles on hosts with many cores, when RFS/RPS is used. The following softnet_data fields need to be updated: - input_queue_tail - input_pkt_queue (next, prev, qlen, lock) - backlog.state (if input_pkt_queue was empty) Unfortunately they are currenly using two cache lines: /* --- cacheline 3 boundary (192 bytes) --- */ call_single_data_t csd __attribute__((__aligned__(64))); /* 0xc0 0x20 */ struct softnet_data * rps_ipi_next; /* 0xe0 0x8 */ unsigned int cpu; /* 0xe8 0x4 */ unsigned int input_queue_tail; /* 0xec 0x4 */ struct sk_buff_head input_pkt_queue; /* 0xf0 0x18 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ struct napi_struct backlog __attribute__((__aligned__(8))); /* 0x108 0x1f0 */ Add one ____cacheline_aligned_in_smp to make sure they now are using a single cache line. Also, because napi_struct has written fields, make @state its first field. We want to make sure that cpus adding packets to sd->input_pkt_queue are not slowing down cpus processing their backlog because of false sharing. 
After this patch new layout is: /* --- cacheline 5 boundary (320 bytes) --- */ long int pad[3] __attribute__((__aligned__(64))); /* 0x140 0x18 */ unsigned int input_queue_tail; /* 0x158 0x4 */ /* XXX 4 bytes hole, try to pack */ struct sk_buff_head input_pkt_queue; /* 0x160 0x18 */ struct napi_struct backlog __attribute__((__aligned__(8))); /* 0x178 0x1f0 */ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251024091240.3292546-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f5aad5cc9a1..9c1e5042c5e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -377,6 +377,8 @@ struct napi_config { * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { + /* This field should be first or softnet_data.backlog needs tweaks. */ + unsigned long state; /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct @@ -385,7 +387,6 @@ struct napi_struct { */ struct list_head poll_list; - unsigned long state; int weight; u32 defer_hard_irqs_count; int (*poll)(struct napi_struct *, int); @@ -3529,9 +3530,17 @@ struct softnet_data { call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; + + /* We force a cacheline alignment from here, to hold together + * input_queue_tail, input_pkt_queue and backlog.state. + * We add holes so that backlog.state is the last field + * of this cache line. + */ + long pad[3] ____cacheline_aligned_in_smp; unsigned int input_queue_tail; #endif struct sk_buff_head input_pkt_queue; + struct napi_struct backlog; struct numa_drop_counters drop_counters; -- cgit v1.2.3 From f74ee32963f1b74865fe679e2475450434fea51c Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Tue, 28 Oct 2025 20:09:00 +0800 Subject: tools/dma: move dma_map_benchmark from selftests to tools/dma dma_map_benchmark is a standalone developer tool rather than an automated selftest. It has no pass/fail criteria, expects manual invocation, and is built as a normal userspace binary. Move it to tools/dma/ and add a minimal Makefile. 
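For context, a minimal userspace sketch of what the tool does with the now-UAPI header is shown below; the parameter values are examples, and the device node is assumed to be the misc device created by the in-kernel benchmark (typically /dev/dma_map_benchmark). The real tool parses these settings from the command line.

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/map_benchmark.h>	/* symlinked/installed UAPI header */

int main(void)
{
	struct map_benchmark bp = {
		.threads  = 1,
		.seconds  = 5,
		.node     = -1,			/* any NUMA node */
		.dma_bits = 32,
		.dma_dir  = DMA_MAP_BIDIRECTIONAL,
		.granule  = 1,			/* map one page at a time */
	};
	int fd = open("/dev/dma_map_benchmark", O_RDWR);

	if (fd < 0)
		return 1;

	if (ioctl(fd, DMA_MAP_BENCHMARK, &bp) == 0)
		printf("average map latency: %llu x 100ns\n",
		       (unsigned long long)bp.avg_map_100ns);

	close(fd);
	return 0;
}
```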
Suggested-by: Marek Szyprowski Suggested-by: Barry Song Signed-off-by: Qinxin Xia Acked-by: Barry Song Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251028120900.2265511-3-xiaqinxin@huawei.com --- include/linux/map_benchmark.h | 32 ------ include/uapi/linux/map_benchmark.h | 35 +++++++ kernel/dma/map_benchmark.c | 2 +- tools/Makefile | 13 +-- tools/dma/.gitignore | 3 + tools/dma/Makefile | 55 ++++++++++ tools/dma/config | 1 + tools/dma/dma_map_benchmark.c | 127 +++++++++++++++++++++++ tools/testing/selftests/dma/Makefile | 7 -- tools/testing/selftests/dma/config | 1 - tools/testing/selftests/dma/dma_map_benchmark.c | 128 ------------------------ 11 files changed, 229 insertions(+), 175 deletions(-) delete mode 100644 include/linux/map_benchmark.h create mode 100644 include/uapi/linux/map_benchmark.h create mode 100644 tools/dma/.gitignore create mode 100644 tools/dma/Makefile create mode 100644 tools/dma/config create mode 100644 tools/dma/dma_map_benchmark.c delete mode 100644 tools/testing/selftests/dma/Makefile delete mode 100644 tools/testing/selftests/dma/config delete mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c (limited to 'include/linux') diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h deleted file mode 100644 index 48e2ff95332f..000000000000 --- a/include/linux/map_benchmark.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2022 HiSilicon Limited. - */ - -#ifndef _KERNEL_DMA_BENCHMARK_H -#define _KERNEL_DMA_BENCHMARK_H - -#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) -#define DMA_MAP_MAX_THREADS 1024 -#define DMA_MAP_MAX_SECONDS 300 -#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) - -#define DMA_MAP_BIDIRECTIONAL 0 -#define DMA_MAP_TO_DEVICE 1 -#define DMA_MAP_FROM_DEVICE 2 - -struct map_benchmark { - __u64 avg_map_100ns; /* average map latency in 100ns */ - __u64 map_stddev; /* standard deviation of map latency */ - __u64 avg_unmap_100ns; /* as above */ - __u64 unmap_stddev; - __u32 threads; /* how many threads will do map/unmap in parallel */ - __u32 seconds; /* how long the test will last */ - __s32 node; /* which numa node this benchmark will run on */ - __u32 dma_bits; /* DMA addressing capability */ - __u32 dma_dir; /* DMA data direction */ - __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ -}; -#endif /* _KERNEL_DMA_BENCHMARK_H */ diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h new file mode 100644 index 000000000000..c2d91088a40d --- /dev/null +++ b/include/uapi/linux/map_benchmark.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2025 HiSilicon Limited. 
+ */ + +#ifndef _UAPI_DMA_BENCHMARK_H +#define _UAPI_DMA_BENCHMARK_H + +#include + +#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) +#define DMA_MAP_MAX_THREADS 1024 +#define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) + +#define DMA_MAP_BIDIRECTIONAL 0 +#define DMA_MAP_TO_DEVICE 1 +#define DMA_MAP_FROM_DEVICE 2 + +struct map_benchmark { + __u64 avg_map_100ns; /* average map latency in 100ns */ + __u64 map_stddev; /* standard deviation of map latency */ + __u64 avg_unmap_100ns; /* as above */ + __u64 unmap_stddev; + __u32 threads; /* how many threads will do map/unmap in parallel */ + __u32 seconds; /* how long the test will last */ + __s32 node; /* which numa node this benchmark will run on */ + __u32 dma_bits; /* DMA addressing capability */ + __u32 dma_dir; /* DMA data direction */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ +}; + +#endif /* _UAPI_DMA_BENCHMARK_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index cc19a3efea89..794041a39e65 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,13 +11,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include struct map_benchmark_data { struct map_benchmark bparam; diff --git a/tools/Makefile b/tools/Makefile index c31cbbd12c45..cb40961a740f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -14,6 +14,7 @@ help: @echo ' counter - counter tools' @echo ' cpupower - a tool for all things x86 CPU power' @echo ' debugging - tools for debugging' + @echo ' dma - tools for DMA mapping' @echo ' firewire - the userspace part of nosy, an IEEE-1394 traffic sniffer' @echo ' firmware - Firmware tools' @echo ' freefall - laptop accelerometer program for disk protection' @@ -69,7 +70,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE +counter dma firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -122,7 +123,7 @@ kvm_stat: FORCE ynl: FORCE $(call descend,net/ynl) -all: acpi counter cpupower gpio hv firewire \ +all: acpi counter cpupower dma gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ @@ -134,7 +135,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: +counter_install dma_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -164,7 +165,7 @@ kvm_stat_install: ynl_install: $(call descend,net/$(@:_install=),install) -install: acpi_install counter_install cpupower_install gpio_install \ +install: acpi_install counter_install cpupower_install dma_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install 
mm_install bpf_install x86_energy_perf_policy_install \ @@ -178,7 +179,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: +counter_clean dma_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -224,7 +225,7 @@ build_clean: ynl_clean: $(call descend,net/$(@:_clean=),clean) -clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ +clean: acpi_clean counter_clean cpupower_clean dma_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ diff --git a/tools/dma/.gitignore b/tools/dma/.gitignore new file mode 100644 index 000000000000..94b68cf4147b --- /dev/null +++ b/tools/dma/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +dma_map_benchmark +include/linux/map_benchmark.h diff --git a/tools/dma/Makefile b/tools/dma/Makefile new file mode 100644 index 000000000000..e4abf37bf020 --- /dev/null +++ b/tools/dma/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../scripts/Makefile.include + +bindir ?= /usr/bin + +# This will work when dma is built in tools env. where srctree +# isn't set and when invoked from selftests build, where srctree +# is set to ".". building_out_of_srctree is undefined for in srctree +# builds +ifndef building_out_of_srctree +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := dma_map_benchmark +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) + +all: $(ALL_PROGRAMS) + +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +# +# We need the following to be outside of kernel tree +# +$(OUTPUT)include/linux/map_benchmark.h: ../../include/uapi/linux/map_benchmark.h + mkdir -p $(OUTPUT)include/linux 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/map_benchmark.h $@ + +prepare: $(OUTPUT)include/linux/map_benchmark.h + +FORCE: + +DMA_MAP_BENCHMARK = dma_map_benchmark +$(DMA_MAP_BENCHMARK): prepare FORCE + $(CC) $(CFLAGS) $(DMA_MAP_BENCHMARK).c -o $(DMA_MAP_BENCHMARK) + +clean: + rm -f $(ALL_PROGRAMS) + rm -rf $(OUTPUT)include + find $(or $(OUTPUT),.) 
-name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete + +install: $(ALL_PROGRAMS) + install -d -m 755 $(DESTDIR)$(bindir); \ + for program in $(ALL_PROGRAMS); do \ + install $$program $(DESTDIR)$(bindir); \ + done + +.PHONY: all install clean prepare FORCE diff --git a/tools/dma/config b/tools/dma/config new file mode 100644 index 000000000000..6102ee3c43cd --- /dev/null +++ b/tools/dma/config @@ -0,0 +1 @@ +CONFIG_DMA_MAP_BENCHMARK=y diff --git a/tools/dma/dma_map_benchmark.c b/tools/dma/dma_map_benchmark.c new file mode 100644 index 000000000000..5474a450863c --- /dev/null +++ b/tools/dma/dma_map_benchmark.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 HiSilicon Limited. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NSEC_PER_MSEC 1000000L + +static char *directions[] = { + "BIDIRECTIONAL", + "TO_DEVICE", + "FROM_DEVICE", +}; + +int main(int argc, char **argv) +{ + struct map_benchmark map; + int fd, opt; + /* default single thread, run 20 seconds on NUMA_NO_NODE */ + int threads = 1, seconds = 20, node = -1; + /* default dma mask 32bit, bidirectional DMA */ + int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; + /* default granule 1 PAGESIZE */ + int granule = 1; + + int cmd = DMA_MAP_BENCHMARK; + + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { + switch (opt) { + case 't': + threads = atoi(optarg); + break; + case 's': + seconds = atoi(optarg); + break; + case 'n': + node = atoi(optarg); + break; + case 'b': + bits = atoi(optarg); + break; + case 'd': + dir = atoi(optarg); + break; + case 'x': + xdelay = atoi(optarg); + break; + case 'g': + granule = atoi(optarg); + break; + default: + return -1; + } + } + + if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) { + fprintf(stderr, "invalid number of threads, must be in 1-%d\n", + DMA_MAP_MAX_THREADS); + exit(1); + } + + if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) { + fprintf(stderr, "invalid number of seconds, must be in 1-%d\n", + DMA_MAP_MAX_SECONDS); + exit(1); + } + + if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { + fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", + DMA_MAP_MAX_TRANS_DELAY); + exit(1); + } + + /* suppose the mininum DMA zone is 1MB in the world */ + if (bits < 20 || bits > 64) { + fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); + exit(1); + } + + if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE && + dir != DMA_MAP_FROM_DEVICE) { + fprintf(stderr, "invalid dma direction\n"); + exit(1); + } + + if (granule < 1 || granule > 1024) { + fprintf(stderr, "invalid granule size\n"); + exit(1); + } + + fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); + if (fd == -1) { + perror("open"); + exit(1); + } + + memset(&map, 0, sizeof(map)); + map.seconds = seconds; + map.threads = threads; + map.node = node; + map.dma_bits = bits; + map.dma_dir = dir; + map.dma_trans_ns = xdelay; + map.granule = granule; + + if (ioctl(fd, cmd, &map)) { + perror("ioctl"); + exit(1); + } + + printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", + threads, seconds, node, dir[directions], granule); + printf("average map latency(us):%.1f standard deviation:%.1f\n", + map.avg_map_100ns/10.0, map.map_stddev/10.0); + printf("average unmap latency(us):%.1f standard deviation:%.1f\n", + map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0); + + return 0; +} diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile 
deleted file mode 100644 index cd8c5ece1cba..000000000000 --- a/tools/testing/selftests/dma/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../usr/include/ -CFLAGS += -I../../../../include/ - -TEST_GEN_PROGS := dma_map_benchmark - -include ../lib.mk diff --git a/tools/testing/selftests/dma/config b/tools/testing/selftests/dma/config deleted file mode 100644 index 6102ee3c43cd..000000000000 --- a/tools/testing/selftests/dma/config +++ /dev/null @@ -1 +0,0 @@ -CONFIG_DMA_MAP_BENCHMARK=y diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c deleted file mode 100644 index b12f1f9babf8..000000000000 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020 HiSilicon Limited. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NSEC_PER_MSEC 1000000L - -static char *directions[] = { - "BIDIRECTIONAL", - "TO_DEVICE", - "FROM_DEVICE", -}; - -int main(int argc, char **argv) -{ - struct map_benchmark map; - int fd, opt; - /* default single thread, run 20 seconds on NUMA_NO_NODE */ - int threads = 1, seconds = 20, node = -1; - /* default dma mask 32bit, bidirectional DMA */ - int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; - /* default granule 1 PAGESIZE */ - int granule = 1; - - int cmd = DMA_MAP_BENCHMARK; - - while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { - switch (opt) { - case 't': - threads = atoi(optarg); - break; - case 's': - seconds = atoi(optarg); - break; - case 'n': - node = atoi(optarg); - break; - case 'b': - bits = atoi(optarg); - break; - case 'd': - dir = atoi(optarg); - break; - case 'x': - xdelay = atoi(optarg); - break; - case 'g': - granule = atoi(optarg); - break; - default: - return -1; - } - } - - if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) { - fprintf(stderr, "invalid number of threads, must be in 1-%d\n", - DMA_MAP_MAX_THREADS); - exit(1); - } - - if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) { - fprintf(stderr, "invalid number of seconds, must be in 1-%d\n", - DMA_MAP_MAX_SECONDS); - exit(1); - } - - if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { - fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", - DMA_MAP_MAX_TRANS_DELAY); - exit(1); - } - - /* suppose the mininum DMA zone is 1MB in the world */ - if (bits < 20 || bits > 64) { - fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); - exit(1); - } - - if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE && - dir != DMA_MAP_FROM_DEVICE) { - fprintf(stderr, "invalid dma direction\n"); - exit(1); - } - - if (granule < 1 || granule > 1024) { - fprintf(stderr, "invalid granule size\n"); - exit(1); - } - - fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); - if (fd == -1) { - perror("open"); - exit(1); - } - - memset(&map, 0, sizeof(map)); - map.seconds = seconds; - map.threads = threads; - map.node = node; - map.dma_bits = bits; - map.dma_dir = dir; - map.dma_trans_ns = xdelay; - map.granule = granule; - - if (ioctl(fd, cmd, &map)) { - perror("ioctl"); - exit(1); - } - - printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", - threads, seconds, node, dir[directions], granule); - printf("average map latency(us):%.1f standard deviation:%.1f\n", - map.avg_map_100ns/10.0, map.map_stddev/10.0); - printf("average unmap latency(us):%.1f standard deviation:%.1f\n", - 
map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0); - - return 0; -} -- cgit v1.2.3 From 23ee8a2563a0f24cf4964685ced23c32be444ab8 Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Tue, 28 Oct 2025 20:08:59 +0800 Subject: dma-mapping: benchmark: Restore padding to ensure uABI remained consistent The padding field in the structure was previously reserved to maintain a stable interface for potential new fields, ensuring compatibility with user-space shared data structures. However,it was accidentally removed by tiantao in a prior commit, which may lead to incompatibility between user space and the kernel. This patch reinstates the padding to restore the original structure layout and preserve compatibility. Fixes: 8ddde07a3d28 ("dma-mapping: benchmark: extract a common header file for map_benchmark definition") Cc: stable@vger.kernel.org Acked-by: Barry Song Signed-off-by: Qinxin Xia Reported-by: Barry Song Closes: https://lore.kernel.org/lkml/CAGsJ_4waiZ2+NBJG+SCnbNk+nQ_ZF13_Q5FHJqZyxyJTcEop2A@mail.gmail.com/ Reviewed-by: Jonathan Cameron Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251028120900.2265511-2-xiaqinxin@huawei.com --- include/linux/map_benchmark.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h index 62674c83bde4..48e2ff95332f 100644 --- a/include/linux/map_benchmark.h +++ b/include/linux/map_benchmark.h @@ -27,5 +27,6 @@ struct map_benchmark { __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; #endif /* _KERNEL_DMA_BENCHMARK_H */ -- cgit v1.2.3 From ed7fc3cbb38ffdca7a189e15982ce96acab4684c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:47 +0300 Subject: dma-mapping: prepare dma_map_ops to conversion to physical address Add new .map_phys() and .unmap_phys() callbacks to dma_map_ops as a preparation to replace .map_page() and .unmap_page() respectively. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-1-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 +++++++ kernel/dma/mapping.c | 4 ++++ kernel/dma/ops_helpers.c | 12 ++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 10882d00cb17..79d2a74d4b49 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -37,6 +37,13 @@ struct dma_map_ops { void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); + + dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*unmap_phys)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); /* * map_sg should return a negative error code on error. 
See * dma_map_sgtable() for a list of appropriate error codes diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fe7472f13b10..4080aebe5deb 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -169,6 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (ops->map_phys) + addr = ops->map_phys(dev, phys, size, dir, attrs); else if (is_mmio) { if (!ops->map_resource) return DMA_MAPPING_ERROR; @@ -223,6 +225,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (ops->unmap_phys) + ops->unmap_phys(dev, addr, size, dir, attrs); else if (is_mmio) { if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 6f9d604d9d40..1eccbdbc99c1 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); struct page *page; + phys_addr_t phys; page = dma_alloc_contiguous(dev, size, gfp); if (!page) @@ -71,9 +72,13 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + phys = page_to_phys(page); if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, - dir, DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->map_phys) + *dma_handle = ops->map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -94,6 +99,9 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->unmap_phys) + ops->unmap_phys(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); -- cgit v1.2.3 From 14cb413af00c5d3950d1a339dd2b6f01ce313fce Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:52 +0300 Subject: dma-mapping: remove unused mapping resource callbacks After ARM and XEN conversions to use physical addresses for the mapping, there are no in-kernel users for map_resource/unmap_resource callbacks, so remove them. 
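For callers, the replacement is the generic dma_map_phys()/dma_unmap_phys() pair from earlier in this series, with DMA_ATTR_MMIO marking regions that are not kmappable. A hedged sketch of the caller side (bar_phys and len are hypothetical placeholders, not taken from this patch):

/* Sketch only: mapping an MMIO region now that ->map_resource() is
 * gone; the generic dma_map_phys() path keys off DMA_ATTR_MMIO.
 */
dma_addr_t dma;

dma = dma_map_phys(dev, bar_phys, len, DMA_TO_DEVICE, DMA_ATTR_MMIO);
if (dma == DMA_MAPPING_ERROR)
	return -EIO;

/* ... hand 'dma' to the device ... */

dma_unmap_phys(dev, dma, len, DMA_TO_DEVICE, DMA_ATTR_MMIO);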
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-6-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 6 ------ kernel/dma/mapping.c | 16 ++++------------ 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 79d2a74d4b49..2e98ecc313a3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -53,12 +53,6 @@ struct dma_map_ops { enum dma_data_direction dir, unsigned long attrs); void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); - dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct device *dev, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 4080aebe5deb..32a85bfdf873 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; - dma_addr_t addr; + dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -171,18 +171,13 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) addr = ops->map_phys(dev, phys, size, dir, attrs); - else if (is_mmio) { - if (!ops->map_resource) - return DMA_MAPPING_ERROR; - - addr = ops->map_resource(dev, phys, size, dir, attrs); - } else { + else if (!is_mmio && ops->map_page) { struct page *page = phys_to_page(phys); size_t offset = offset_in_page(phys); /* * The dma_ops API contract for ops->map_page() requires - * kmappable memory, while ops->map_resource() does not. + * kmappable memory. */ addr = ops->map_page(dev, page, offset, size, dir, attrs); } @@ -227,10 +222,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) ops->unmap_phys(dev, addr, size, dir, attrs); - else if (is_mmio) { - if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - } else + else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); -- cgit v1.2.3 From 131971f67e258170c678fe572fda95f8cef88e66 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:13:00 +0300 Subject: dma-mapping: remove unused map_page callback After conversion of arch code to use physical address mapping, there are no users of .map_page() and .unmap_page() callbacks, so let's remove them. 
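For any remaining dma_map_ops provider (for example out of tree), the conversion this removal assumes looks roughly as follows; the foo_* names are hypothetical and only the .map_phys()/.unmap_phys() prototypes come from this series:

/* Hypothetical dma_map_ops conversion sketch, not a real driver. */
static dma_addr_t foo_map_phys(struct device *dev, phys_addr_t phys,
			       size_t size, enum dma_data_direction dir,
			       unsigned long attrs)
{
	/* Page-based internals can still be derived from the physical address. */
	struct page *page = phys_to_page(phys);
	size_t offset = offset_in_page(phys);

	return foo_hw_map(dev, page, offset, size, dir, attrs);
}

static void foo_unmap_phys(struct device *dev, dma_addr_t dma_handle,
			   size_t size, enum dma_data_direction dir,
			   unsigned long attrs)
{
	foo_hw_unmap(dev, dma_handle, size, dir, attrs);
}

static const struct dma_map_ops foo_dma_ops = {
	.map_phys	= foo_map_phys,
	.unmap_phys	= foo_unmap_phys,
	/* .map_page/.unmap_page no longer exist after this patch. */
};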
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-14-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 ------- kernel/dma/mapping.c | 12 ------------ kernel/dma/ops_helpers.c | 8 +------- 3 files changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2e98ecc313a3..4809204c674c 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -31,13 +31,6 @@ struct dma_map_ops { void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); - void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 32a85bfdf873..37163eb49f9f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -171,16 +171,6 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) addr = ops->map_phys(dev, phys, size, dir, attrs); - else if (!is_mmio && ops->map_page) { - struct page *page = phys_to_page(phys); - size_t offset = offset_in_page(phys); - - /* - * The dma_ops API contract for ops->map_page() requires - * kmappable memory. - */ - addr = ops->map_page(dev, page, offset, size, dir, attrs); - } if (!is_mmio) kmsan_handle_dma(phys, size, dir); @@ -222,8 +212,6 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) ops->unmap_phys(dev, addr, size, dir, attrs); - else - ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 1eccbdbc99c1..20caf9cabf69 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -76,11 +76,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (use_dma_iommu(dev)) *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->map_phys) - *dma_handle = ops->map_phys(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); else - *dma_handle = ops->map_page(dev, page, 0, size, dir, + *dma_handle = ops->map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); @@ -102,8 +99,5 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, else if (ops->unmap_phys) ops->unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->unmap_page) - ops->unmap_page(dev, dma_handle, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); } -- cgit v1.2.3 From c31b9d2f589463a7cb286467a618b3b598654890 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:44:10 +0200 Subject: unwind: Shorten lines There are some exceptionally long lines that cause ugly wrapping. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.545274393@infradead.org --- include/linux/unwind_deferred.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 26122d00708a..25f4dffebd1b 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -8,7 +8,9 @@ struct unwind_work; -typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie); +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); struct unwind_work { struct list_head list; @@ -68,9 +70,17 @@ static __always_inline void unwind_reset_info(void) static inline void unwind_task_init(struct task_struct *task) {} static inline void unwind_task_free(struct task_struct *task) {} -static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; } -static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; } -static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; } +static inline int unwind_user_faultable(struct unwind_stacktrace *trace) +{ return -ENOSYS; } + +static inline int +unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ return -ENOSYS; } + +static inline int +unwind_deferred_request(struct unwind_work *work, u64 *timestamp) +{ return -ENOSYS; } + static inline void unwind_deferred_cancel(struct unwind_work *work) {} static inline void unwind_deferred_task_exit(struct task_struct *task) {} -- cgit v1.2.3 From b1164c7d118defb01a885b53f56e3336db784df7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:44:59 +0200 Subject: unwind: Add required include files To be self sufficient, the file needs to include linux/types.h. This provides things like u32/u64 and struct callback_head. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.665787071@infradead.org --- include/linux/unwind_deferred_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 33b62ac25c86..29452ff49859 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -2,6 +2,8 @@ #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H +#include + struct unwind_cache { unsigned long unwind_completed; unsigned int nr_entries; -- cgit v1.2.3 From 52a1ec718b3eb6da29a76d05a662365a997139cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:46:00 +0200 Subject: unwind: Simplify unwind_reset_info() Invert the condition of the first if and make it an early exit to reduce an indent level for the rest fo the function. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.777916262@infradead.org --- include/linux/unwind_deferred.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 25f4dffebd1b..196e12c1449e 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -46,22 +46,22 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = ¤t->unwind_info; - unsigned long bits; + unsigned long bits = info->unwind_mask; /* Was there any unwinding? */ - if (unlikely(info->unwind_mask)) { - bits = info->unwind_mask; - do { - /* Is a task_work going to run again before going back */ - if (bits & UNWIND_PENDING) - return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); - current->unwind_info.id.id = 0; - - if (unlikely(info->cache)) { - info->cache->nr_entries = 0; - info->cache->unwind_completed = 0; - } + if (likely(!bits)) + return; + + do { + /* Is a task_work going to run again before going back */ + if (bits & UNWIND_PENDING) + return; + } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + current->unwind_info.id.id = 0; + + if (unlikely(info->cache)) { + info->cache->nr_entries = 0; + info->cache->unwind_completed = 0; } } -- cgit v1.2.3 From 639214f65b1db87c6992eadf93079ff0d8768c2d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 16:09:17 +0200 Subject: unwind: Make unwind_task_info::unwind_mask consistent The unwind_task_info::unwind_mask was manipulated using a mixture of: regular store WRITE_ONCE() try_cmpxchg() set_bit() atomic_long_*() Clean up and make it consistently atomic_long_t. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20250924080119.384384486@infradead.org --- include/linux/unwind_deferred.h | 4 ++-- include/linux/unwind_deferred_types.h | 3 ++- kernel/unwind/deferred.c | 17 +++++++++-------- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 196e12c1449e..f4743c8cff4c 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -46,7 +46,7 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = ¤t->unwind_info; - unsigned long bits = info->unwind_mask; + unsigned long bits = atomic_long_read(&info->unwind_mask); /* Was there any unwinding? 
*/ if (likely(!bits)) @@ -56,7 +56,7 @@ static __always_inline void unwind_reset_info(void) /* Is a task_work going to run again before going back */ if (bits & UNWIND_PENDING) return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL)); current->unwind_info.id.id = 0; if (unlikely(info->cache)) { diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 29452ff49859..0a4c8ddbbc57 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -3,6 +3,7 @@ #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H #include +#include struct unwind_cache { unsigned long unwind_completed; @@ -32,7 +33,7 @@ union unwind_task_id { }; struct unwind_task_info { - unsigned long unwind_mask; + atomic_long_t unwind_mask; struct unwind_cache *cache; struct callback_head work; union unwind_task_id id; diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index 09617d8ae24b..a88fb481c4a3 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* @@ -141,7 +141,7 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ - set_bit(UNWIND_USED_BIT, &info->unwind_mask); + atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } @@ -159,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task) /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, - (atomic_long_t *)&info->unwind_mask); + &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -264,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - old = READ_ONCE(info->unwind_mask); + old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) @@ -277,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * to have a callback. 
*/ bits = UNWIND_PENDING | bit; - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better @@ -291,7 +291,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) - WRITE_ONCE(info->unwind_mask, 0); + atomic_long_set(&info->unwind_mask, 0); return ret; } @@ -323,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work) guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(bit, &t->unwind_info.unwind_mask); + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } @@ -353,7 +354,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); - info->unwind_mask = 0; + atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) -- cgit v1.2.3 From c79dd946e370af3537edb854f210cba3a94b4516 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2025 13:27:34 +0200 Subject: unwind: Implement compat fp unwind It is important to be able to unwind compat tasks too. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20250924080119.613695709@infradead.org --- include/linux/unwind_user_types.h | 1 + kernel/unwind/user.c | 40 ++++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index a449f15be890..938f7e623332 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -36,6 +36,7 @@ struct unwind_user_state { unsigned long ip; unsigned long sp; unsigned long fp; + unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; bool done; diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 9dcde797b5d9..642871527a13 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -8,19 +8,32 @@ #include #include -static const struct unwind_user_frame fp_frame = { - ARCH_INIT_USER_FP_FRAME -}; - #define for_each_user_frame(state) \ for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + static int unwind_user_next_fp(struct unwind_user_state *state) { - const struct unwind_user_frame *frame = &fp_frame; + const struct unwind_user_frame frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; unsigned long cfa, fp, ra; - if (frame->use_fp) { + if (frame.use_fp) { if (state->fp < state->sp) return -EINVAL; cfa = state->fp; @@ -29,26 +42,26 @@ static int unwind_user_next_fp(struct unwind_user_state *state) } /* Get the Canonical Frame Address (CFA) */ - cfa += frame->cfa_off; + cfa += frame.cfa_off; /* stack going in wrong direction? 
*/ if (cfa <= state->sp) return -EINVAL; /* Make sure that the address is word aligned */ - if (cfa & (sizeof(long) - 1)) + if (cfa & (state->ws - 1)) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + if (get_user_word(&ra, cfa, frame.ra_off, state->ws)) return -EINVAL; - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + if (frame.fp_off && get_user_word(&fp, cfa, frame.fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; - if (frame->fp_off) + if (frame.fp_off) state->fp = fp; return 0; } @@ -100,6 +113,11 @@ static int unwind_user_start(struct unwind_user_state *state) state->ip = instruction_pointer(regs); state->sp = user_stack_pointer(regs); state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } return 0; } -- cgit v1.2.3 From ae25884ad749e7f6e0c3565513bdc8aa2554a425 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2025 12:31:10 +0200 Subject: unwind_user/x86: Teach FP unwind about start of function When userspace is interrupted at the start of a function, before we get a chance to complete the frame, unwind will miss one caller. X86 has a uprobe specific fixup for this, add bits to the generic unwinder to support this. Suggested-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251024145156.GM4068168@noisy.programming.kicks-ass.net --- arch/x86/events/core.c | 40 -------------------------------------- arch/x86/include/asm/unwind_user.h | 12 ++++++++++++ arch/x86/include/asm/uprobes.h | 9 +++++++++ arch/x86/kernel/uprobes.c | 32 ++++++++++++++++++++++++++++++ include/linux/unwind_user_types.h | 1 + kernel/unwind/user.c | 39 ++++++++++++++++++++++++++++--------- 6 files changed, 84 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 745caa6c15a3..0cf68ad9dcd0 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2845,46 +2845,6 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_UPROBES -/* - * Heuristic-based check if uprobe is installed at the function entry. - * - * Under assumption of user code being compiled with frame pointers, - * `push %rbp/%ebp` is a good indicator that we indeed are. - * - * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. - * If we get this wrong, captured stack trace might have one extra bogus - * entry, but the rest of stack trace will still be meaningful. 
- */ -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - struct arch_uprobe *auprobe; - - if (!current->utask) - return false; - - auprobe = current->utask->auprobe; - if (!auprobe) - return false; - - /* push %rbp/%ebp */ - if (auprobe->insn[0] == 0x55) - return true; - - /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) - return true; - - return false; -} - -#else -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_UPROBES */ - #ifdef CONFIG_IA32_EMULATION #include diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index b166e102d444..c4f1ff8874d6 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -3,6 +3,7 @@ #define _ASM_X86_UNWIND_USER_H #include +#include #define ARCH_INIT_USER_FP_FRAME(ws) \ .cfa_off = 2*(ws), \ @@ -10,6 +11,12 @@ .fp_off = -2*(ws), \ .use_fp = true, +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + static inline int unwind_user_word_size(struct pt_regs *regs) { /* We can't unwind VM86 stacks */ @@ -22,4 +29,9 @@ static inline int unwind_user_word_size(struct pt_regs *regs) return sizeof(long); } +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return is_uprobe_at_func_entry(regs); +} + #endif /* _ASM_X86_UNWIND_USER_H */ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1ee2e5115955..362210c79998 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -62,4 +62,13 @@ struct arch_uprobe_task { unsigned int saved_tf; }; +#ifdef CONFIG_UPROBES +extern bool is_uprobe_at_func_entry(struct pt_regs *regs); +#else +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ + #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a563e90832d7..7be8e361ca55 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1791,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, else return regs->sp <= ret->stack; } + +/* + * Heuristic-based check if uprobe is installed at the function entry. + * + * Under assumption of user code being compiled with frame pointers, + * `push %rbp/%ebp` is a good indicator that we indeed are. + * + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. + * If we get this wrong, captured stack trace might have one extra bogus + * entry, but the rest of stack trace will still be meaningful. 
+ */ +bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + struct arch_uprobe *auprobe; + + if (!current->utask) + return false; + + auprobe = current->utask->auprobe; + if (!auprobe) + return false; + + /* push %rbp/%ebp */ + if (auprobe->insn[0] == 0x55) + return true; + + /* endbr64 (64-bit only) */ + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) + return true; + + return false; +} diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index 938f7e623332..412729a269bc 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -39,6 +39,7 @@ struct unwind_user_state { unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; + bool topmost; bool done; }; diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 642871527a13..39e270789444 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -26,14 +26,12 @@ get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) return get_user(*word, addr); } -static int unwind_user_next_fp(struct unwind_user_state *state) +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) { - const struct unwind_user_frame frame = { - ARCH_INIT_USER_FP_FRAME(state->ws) - }; unsigned long cfa, fp, ra; - if (frame.use_fp) { + if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; cfa = state->fp; @@ -42,7 +40,7 @@ static int unwind_user_next_fp(struct unwind_user_state *state) } /* Get the Canonical Frame Address (CFA) */ - cfa += frame.cfa_off; + cfa += frame->cfa_off; /* stack going in wrong direction? */ if (cfa <= state->sp) @@ -53,19 +51,41 @@ static int unwind_user_next_fp(struct unwind_user_state *state) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user_word(&ra, cfa, frame.ra_off, state->ws)) + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; - if (frame.fp_off && get_user_word(&fp, cfa, frame.fp_off, state->ws)) + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; - if (frame.fp_off) + if (frame->fp_off) state->fp = fp; + state->topmost = false; return 0; } +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + static int unwind_user_next(struct unwind_user_state *state) { unsigned long iter_mask = state->available_types; @@ -118,6 +138,7 @@ static int unwind_user_start(struct unwind_user_state *state) state->done = true; return -EINVAL; } + state->topmost = true; return 0; } -- cgit v1.2.3 From c69993ecdd4dfde2b7da08b022052a33b203da07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Oct 2025 15:17:05 +0200 Subject: perf: Support deferred user unwind Add support for deferred userspace unwind to perf. Where perf currently relies on in-place stack unwinding; from NMI context and all that. This moves the userspace part of the unwind to right before the return-to-userspace. 
This has two distinct benefits, the biggest is that it moves the unwind to a faultable context. It becomes possible to fault in debug info (.eh_frame, SFrame etc.) that might not otherwise be readily available. And secondly, it de-duplicates the user callchain where multiple samples happen during the same kernel entry. To facilitate this the perf interface is extended with a new record type: PERF_RECORD_CALLCHAIN_DEFERRED and two new attribute flags: perf_event_attr::defer_callchain - to request the user unwind be deferred perf_event_attr::defer_output - to request PERF_RECORD_CALLCHAIN_DEFERRED records The existing PERF_RECORD_SAMPLE callchain section gets a new context type: PERF_CONTEXT_USER_DEFERRED After which will come a single entry, denoting the 'cookie' of the deferred callchain that should be attached here, matching the 'cookie' field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED. The 'defer_callchain' flag is expected on all events with PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expect on the event responsible for collecting side-band events (like mmap, comm etc.). Setting 'defer_output' on multiple events will get you duplicated PERF_RECORD_CALLCHAIN_DEFERRED records. Based on earlier patches by Josh and Steven. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net --- include/linux/perf_event.h | 2 +- include/linux/unwind_deferred.h | 12 ------ include/linux/unwind_deferred_types.h | 13 ++++++ include/uapi/linux/perf_event.h | 21 +++++++++- kernel/bpf/stackmap.c | 4 +- kernel/events/callchain.c | 14 ++++++- kernel/events/core.c | 78 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/perf_event.h | 21 +++++++++- 8 files changed, 145 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fd1d91017b99..9870d768db4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index f4743c8cff4c..bc7ae7d21900 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -6,18 +6,6 @@ #include #include -struct unwind_work; - -typedef void (*unwind_callback_t)(struct unwind_work *work, - struct unwind_stacktrace *trace, - u64 cookie); - -struct unwind_work { - struct list_head list; - unwind_callback_t func; - int bit; -}; - #ifdef CONFIG_UNWIND_USER enum { diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 0a4c8ddbbc57..18fa3932f61c 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -39,4 +39,17 @@ struct unwind_task_info { union unwind_task_id id; }; +struct unwind_work; +struct unwind_stacktrace; + +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); + +struct unwind_work { + 
struct list_head list; + unwind_callback_t func; + int bit; +}; + #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..8f1dacaf01fe 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). 
+ */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 7541f6f85fcb..f6a08c73f783 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? 
*/ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -14799,6 +14870,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, -- cgit v1.2.3 From 9c7f7262bc1affb9b9acd2ec2fb1f6314d5d474c Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Wed, 29 Oct 2025 09:12:47 +0100 Subject: regmap: add flat cache with sparse validity The flat regcache will always assume the data in the cache is valid. Since the cache is preferred over hardware access, this may shadow the actual state of the device. Add a new containing cache structure with the flat data table and a bitmap indicating cache validity. REGCACHE_FLAT will still behave as before, as the validity is ignored. Define new cache type REGCACHE_FLAT_S: a flat cache with sparse validity. 
The sparse validity is used to determine if a hardware access should occur to initialize the cache on the fly, vs. at regmap init for REGCACHE_FLAT. Contrary to REGCACHE_FLAT, this allows us to implement regcache_ops.drop. Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20251029081248.52607-2-sander@svanheule.net Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 1 + drivers/base/regmap/regcache-flat.c | 102 ++++++++++++++++++++++++++++++++---- drivers/base/regmap/regcache.c | 1 + drivers/base/regmap/regmap-kunit.c | 22 ++++++++ include/linux/regmap.h | 17 +++--- 5 files changed, 126 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 6f31240ee4a9..8d19a1414d5b 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -288,6 +288,7 @@ enum regmap_endian regmap_get_val_endian(struct device *dev, const struct regmap_bus *bus, const struct regmap_config *config); +extern struct regcache_ops regcache_flat_sparse_ops; extern struct regcache_ops regcache_rbtree_ops; extern struct regcache_ops regcache_maple_ops; extern struct regcache_ops regcache_flat_ops; diff --git a/drivers/base/regmap/regcache-flat.c b/drivers/base/regmap/regcache-flat.c index f36d3618b67c..86f7679175b1 100644 --- a/drivers/base/regmap/regcache-flat.c +++ b/drivers/base/regmap/regcache-flat.c @@ -6,7 +6,11 @@ // // Author: Mark Brown +#include +#include #include +#include +#include #include #include @@ -18,34 +22,62 @@ static inline unsigned int regcache_flat_get_index(const struct regmap *map, return regcache_get_index_by_order(map, reg); } +struct regcache_flat_data { + unsigned long *valid; + unsigned int data[]; +}; + static int regcache_flat_init(struct regmap *map) { int i; - unsigned int *cache; + size_t cache_data_size; + unsigned int cache_size; + struct regcache_flat_data *cache; if (!map || map->reg_stride_order < 0 || !map->max_register_is_set) return -EINVAL; - map->cache = kcalloc(regcache_flat_get_index(map, map->max_register) - + 1, sizeof(unsigned int), map->alloc_flags); - if (!map->cache) + cache_size = regcache_flat_get_index(map, map->max_register) + 1; + cache_data_size = struct_size(cache, data, cache_size); + + if (cache_data_size == SIZE_MAX) { + dev_err(map->dev, "cannot allocate regmap cache"); return -ENOMEM; + } - cache = map->cache; + cache = kzalloc(cache_data_size, map->alloc_flags); + if (!cache) + return -ENOMEM; + + cache->valid = bitmap_zalloc(cache_size, map->alloc_flags); + if (!cache->valid) + goto err_free; + + map->cache = cache; for (i = 0; i < map->num_reg_defaults; i++) { unsigned int reg = map->reg_defaults[i].reg; unsigned int index = regcache_flat_get_index(map, reg); - cache[index] = map->reg_defaults[i].def; + cache->data[index] = map->reg_defaults[i].def; + __set_bit(index, cache->valid); } return 0; + +err_free: + kfree(cache); + return -ENOMEM; } static int regcache_flat_exit(struct regmap *map) { - kfree(map->cache); + struct regcache_flat_data *cache = map->cache; + + if (cache) + bitmap_free(cache->valid); + + kfree(cache); map->cache = NULL; return 0; @@ -54,10 +86,24 @@ static int regcache_flat_exit(struct regmap *map) static int regcache_flat_read(struct regmap *map, unsigned int reg, unsigned int *value) { - unsigned int *cache = map->cache; + struct regcache_flat_data *cache = map->cache; unsigned int index = regcache_flat_get_index(map, reg); - *value = cache[index]; + *value = cache->data[index]; + + return 0; +} + +static 
int regcache_flat_sparse_read(struct regmap *map, + unsigned int reg, unsigned int *value) +{ + struct regcache_flat_data *cache = map->cache; + unsigned int index = regcache_flat_get_index(map, reg); + + if (unlikely(!test_bit(index, cache->valid))) + return -ENOENT; + + *value = cache->data[index]; return 0; } @@ -65,10 +111,34 @@ static int regcache_flat_read(struct regmap *map, static int regcache_flat_write(struct regmap *map, unsigned int reg, unsigned int value) { - unsigned int *cache = map->cache; + struct regcache_flat_data *cache = map->cache; unsigned int index = regcache_flat_get_index(map, reg); - cache[index] = value; + cache->data[index] = value; + + return 0; +} + +static int regcache_flat_sparse_write(struct regmap *map, unsigned int reg, + unsigned int value) +{ + struct regcache_flat_data *cache = map->cache; + unsigned int index = regcache_flat_get_index(map, reg); + + cache->data[index] = value; + __set_bit(index, cache->valid); + + return 0; +} + +static int regcache_flat_drop(struct regmap *map, unsigned int min, + unsigned int max) +{ + struct regcache_flat_data *cache = map->cache; + unsigned int bitmap_min = regcache_flat_get_index(map, min); + unsigned int bitmap_max = regcache_flat_get_index(map, max); + + bitmap_clear(cache->valid, bitmap_min, bitmap_max + 1 - bitmap_min); return 0; } @@ -81,3 +151,13 @@ struct regcache_ops regcache_flat_ops = { .read = regcache_flat_read, .write = regcache_flat_write, }; + +struct regcache_ops regcache_flat_sparse_ops = { + .type = REGCACHE_FLAT_S, + .name = "flat-sparse", + .init = regcache_flat_init, + .exit = regcache_flat_exit, + .read = regcache_flat_sparse_read, + .write = regcache_flat_sparse_write, + .drop = regcache_flat_drop, +}; diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c index c7650fa434ad..0392f5525cf3 100644 --- a/drivers/base/regmap/regcache.c +++ b/drivers/base/regmap/regcache.c @@ -16,6 +16,7 @@ #include "internal.h" static const struct regcache_ops *cache_types[] = { + ®cache_flat_sparse_ops, ®cache_rbtree_ops, ®cache_maple_ops, ®cache_flat_ops, diff --git a/drivers/base/regmap/regmap-kunit.c b/drivers/base/regmap/regmap-kunit.c index 95c5bf2a78ee..f6fc5ed016da 100644 --- a/drivers/base/regmap/regmap-kunit.c +++ b/drivers/base/regmap/regmap-kunit.c @@ -54,6 +54,8 @@ static const char *regcache_type_name(enum regcache_type type) return "none"; case REGCACHE_FLAT: return "flat"; + case REGCACHE_FLAT_S: + return "flat-sparse"; case REGCACHE_RBTREE: return "rbtree"; case REGCACHE_MAPLE: @@ -93,6 +95,8 @@ static const struct regmap_test_param regcache_types_list[] = { { .cache = REGCACHE_NONE, .fast_io = true }, { .cache = REGCACHE_FLAT }, { .cache = REGCACHE_FLAT, .fast_io = true }, + { .cache = REGCACHE_FLAT_S }, + { .cache = REGCACHE_FLAT_S, .fast_io = true }, { .cache = REGCACHE_RBTREE }, { .cache = REGCACHE_RBTREE, .fast_io = true }, { .cache = REGCACHE_MAPLE }, @@ -104,6 +108,8 @@ KUNIT_ARRAY_PARAM(regcache_types, regcache_types_list, param_to_desc); static const struct regmap_test_param real_cache_types_only_list[] = { { .cache = REGCACHE_FLAT }, { .cache = REGCACHE_FLAT, .fast_io = true }, + { .cache = REGCACHE_FLAT_S }, + { .cache = REGCACHE_FLAT_S, .fast_io = true }, { .cache = REGCACHE_RBTREE }, { .cache = REGCACHE_RBTREE, .fast_io = true }, { .cache = REGCACHE_MAPLE }, @@ -119,6 +125,12 @@ static const struct regmap_test_param real_cache_types_list[] = { { .cache = REGCACHE_FLAT, .from_reg = 0x2002 }, { .cache = REGCACHE_FLAT, .from_reg = 0x2003 }, { .cache = 
REGCACHE_FLAT, .from_reg = 0x2004 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0, .fast_io = true }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2001 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2002 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2003 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2004 }, { .cache = REGCACHE_RBTREE, .from_reg = 0 }, { .cache = REGCACHE_RBTREE, .from_reg = 0, .fast_io = true }, { .cache = REGCACHE_RBTREE, .from_reg = 0x2001 }, @@ -136,6 +148,12 @@ static const struct regmap_test_param real_cache_types_list[] = { KUNIT_ARRAY_PARAM(real_cache_types, real_cache_types_list, param_to_desc); static const struct regmap_test_param sparse_cache_types_list[] = { + { .cache = REGCACHE_FLAT_S, .from_reg = 0 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0, .fast_io = true }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2001 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2002 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2003 }, + { .cache = REGCACHE_FLAT_S, .from_reg = 0x2004 }, { .cache = REGCACHE_RBTREE, .from_reg = 0 }, { .cache = REGCACHE_RBTREE, .from_reg = 0, .fast_io = true }, { .cache = REGCACHE_RBTREE, .from_reg = 0x2001 }, @@ -1597,6 +1615,8 @@ static const struct regmap_test_param raw_types_list[] = { { .cache = REGCACHE_NONE, .val_endian = REGMAP_ENDIAN_BIG }, { .cache = REGCACHE_FLAT, .val_endian = REGMAP_ENDIAN_LITTLE }, { .cache = REGCACHE_FLAT, .val_endian = REGMAP_ENDIAN_BIG }, + { .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_LITTLE }, + { .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_BIG }, { .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_LITTLE }, { .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_BIG }, { .cache = REGCACHE_MAPLE, .val_endian = REGMAP_ENDIAN_LITTLE }, @@ -1608,6 +1628,8 @@ KUNIT_ARRAY_PARAM(raw_test_types, raw_types_list, param_to_desc); static const struct regmap_test_param raw_cache_types_list[] = { { .cache = REGCACHE_FLAT, .val_endian = REGMAP_ENDIAN_LITTLE }, { .cache = REGCACHE_FLAT, .val_endian = REGMAP_ENDIAN_BIG }, + { .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_LITTLE }, + { .cache = REGCACHE_FLAT_S, .val_endian = REGMAP_ENDIAN_BIG }, { .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_LITTLE }, { .cache = REGCACHE_RBTREE, .val_endian = REGMAP_ENDIAN_BIG }, { .cache = REGCACHE_MAPLE, .val_endian = REGMAP_ENDIAN_LITTLE }, diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4e1ac1fbcec4..17bed25dc4e3 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -55,18 +55,23 @@ struct sdw_slave; #define REGMAP_DOWNSHIFT(s) (s) /* - * The supported cache types, the default is no cache. Any new caches - * should usually use the maple tree cache unless they specifically - * require that there are never any allocations at runtime and can't - * provide defaults in which case they should use the flat cache. The - * rbtree cache *may* have some performance advantage for very low end - * systems that make heavy use of cache syncs but is mainly legacy. + * The supported cache types, the default is no cache. Any new caches should + * usually use the maple tree cache unless they specifically require that there + * are never any allocations at runtime in which case they should use the sparse + * flat cache. The rbtree cache *may* have some performance advantage for very + * low end systems that make heavy use of cache syncs but is mainly legacy. 
+ * These caches are sparse and entries will be initialized from hardware if no + * default has been provided. + * The non-sparse flat cache is provided for compatibility with existing users + * and will zero-initialize cache entries for which no defaults are provided. + * New users should use the sparse flat cache. */ enum regcache_type { REGCACHE_NONE, REGCACHE_RBTREE, REGCACHE_FLAT, REGCACHE_MAPLE, + REGCACHE_FLAT_S, }; /** -- cgit v1.2.3 From 7fabcb7fbabbcddd9dc42dbe4c92d18ce3e54283 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:17 +0200 Subject: mm,btrfs: add a filemap_flush_nr helper Abstract out the btrfs-specific behavior of kicking off I/O on a number of pages on an address_space into a well-defined helper. Note: there is no kerneldoc comment for the new function because it is not part of the public API. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-7-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 13 ++----------- include/linux/pagemap.h | 1 + mm/filemap.c | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b97d6c1f7772..d12b8116adde 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8752,19 +8752,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - struct writeback_control wbc = { - .nr_to_write = *nr_to_write, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; - - ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping, - &wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (*nr_to_write != LONG_MAX) - *nr_to_write = wbc.nr_to_write; if (ret || *nr_to_write <= 0) goto out; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..cebdf160d3dd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping, int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 99d6919af60d..e344b79a012d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -474,6 +474,28 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. Talk to linux-mm if you think adding a + * new caller is a good idea. 
+ */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + struct writeback_control wbc = { + .nr_to_write = *nr_to_write, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + int ret; + + ret = filemap_fdatawrite_wbc(mapping, &wbc); + if (!ret) + *nr_to_write = wbc.nr_to_write; + return ret; +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check -- cgit v1.2.3 From 1bcb413d0cd80efb386751910036a93147fd8dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:19 +0200 Subject: mm: remove filemap_fdatawrite_wbc Replace filemap_fdatawrite_wbc, which exposes a writeback_control to the callers with a filemap_writeback helper that takes all the possible arguments and declares the writeback_control itself. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 6 +++--- include/linux/pagemap.h | 2 -- mm/filemap.c | 54 +++++++++++++++++-------------------------------- 3 files changed, 21 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e76192d140e3..4448de35ec8b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -822,9 +822,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cebdf160d3dd..678d8ae23d01 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -60,8 +60,6 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) diff --git a/mm/filemap.c b/mm/filemap.c index 3d4c4a96c586..7126d0587c94 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -366,31 +366,30 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. 
- */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range @@ -412,14 +411,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_wbc); int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_writeback(mapping, start, end, sync_mode, NULL); } int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, @@ -475,18 +467,8 @@ EXPORT_SYMBOL(filemap_flush); */ int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) { - struct writeback_control wbc = { - .nr_to_write = *nr_to_write, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; - int ret; - - ret = filemap_fdatawrite_wbc(mapping, &wbc); - if (!ret) - *nr_to_write = wbc.nr_to_write; - return ret; + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); } EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); -- cgit v1.2.3 From 45cbce5b8877f339b72548f60aa97634044c255c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:20 +0200 Subject: mm: remove __filemap_fdatawrite_range Use filemap_fdatawrite_range and filemap_fdatawrite_range_kick instead of the low-level __filemap_fdatawrite_range that requires the caller to know the internals of the writeback_control structure and remove __filemap_fdatawrite_range now that it is trivial and only two callers would be left. 
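For illustration only, a converted caller now simply picks one of the two remaining wrappers instead of passing a sync mode; mapping, start, end and for_sync below are stand-ins for the caller's own state, not code from this series:

/*
 * Sketch of a converted caller: flush a byte range either as a
 * data-integrity writeback or as a background kick, without touching
 * struct writeback_control directly.
 */
static int example_flush_range(struct address_space *mapping,
			       loff_t start, loff_t end, bool for_sync)
{
	if (for_sync)
		return filemap_fdatawrite_range(mapping, start, end);

	return filemap_fdatawrite_range_kick(mapping, start, end);
}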
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-10-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/sync.c | 11 +++++------ include/linux/pagemap.h | 2 -- mm/fadvise.c | 3 +-- mm/filemap.c | 25 +++++++------------------ 4 files changed, 13 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..6d8b04e04c3c 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -280,14 +280,13 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_fdatawrite_range_kick(mapping, offset, + endbyte); if (ret < 0) goto out; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 678d8ae23d01..d0a7dd43c835 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -54,8 +54,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); diff --git a/mm/fadvise.c b/mm/fadvise.c index 588fe76c5a14..f1be619f0e58 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + filemap_fdatawrite_range_kick(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 7126d0587c94..f90f5bb2b825 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -392,32 +392,23 @@ static int filemap_writeback(struct address_space *mapping, loff_t start, } /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets inclusive. * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. 
*/ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - return filemap_writeback(mapping, start, end, sync_mode, NULL); -} - int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } EXPORT_SYMBOL(filemap_fdatawrite_range); @@ -441,7 +432,7 @@ EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); @@ -689,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. @@ -792,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); -- cgit v1.2.3 From c28d67b33cbf6da2043ee7517f1aa4cbf92dbbba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:21 +0200 Subject: mm: rename filemap_fdatawrite_range_kick to filemap_flush_range Rename filemap_fdatawrite_range_kick to filemap_flush_range because it is the ranged version of filemap_flush. 
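For reference, a sketch of the intended correspondence after the rename (illustrative only; the range 0..LLONG_MAX covers the whole mapping, which is exactly how filemap_flush() is implemented below):

	/* Whole-mapping, non-blocking flush. */
	filemap_flush(mapping);

	/* Equivalent ranged form after the rename. */
	filemap_flush_range(mapping, 0, LLONG_MAX);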
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-11-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/sync.c | 3 +-- include/linux/fs.h | 6 +++--- mm/fadvise.c | 2 +- mm/filemap.c | 8 ++++---- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/sync.c b/fs/sync.c index 6d8b04e04c3c..1759f6ba36cd 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -285,8 +285,7 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, ret = filemap_fdatawrite_range(mapping, offset, endbyte); else - ret = filemap_fdatawrite_range_kick(mapping, offset, - endbyte); + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..a5dbfa20f8d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3014,7 +3014,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) @@ -3051,8 +3051,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_flush_range(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; diff --git a/mm/fadvise.c b/mm/fadvise.c index f1be619f0e58..67028e30aa91 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,7 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - filemap_fdatawrite_range_kick(mapping, offset, endbyte); + filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index f90f5bb2b825..fa770768ea3a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -419,7 +419,7 @@ int filemap_fdatawrite(struct address_space *mapping) EXPORT_SYMBOL(filemap_fdatawrite); /** - * filemap_fdatawrite_range_kick - start writeback on a range + * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback @@ -429,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite); * * Return: %0 on success, negative error code otherwise. 
*/ -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -447,7 +447,7 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return filemap_fdatawrite_range_kick(mapping, 0, LLONG_MAX); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); -- cgit v1.2.3 From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:48 +0200 Subject: writeback: allow the file system to override MIN_WRITEBACK_PAGES The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- fs/super.c | 1 + include/linux/fs.h | 1 + include/linux/writeback.h | 5 +++++ 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30de37865fa1..52763fa499d6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -32,11 +32,6 @@ #include #include "internal.h" -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -1889,8 +1884,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1913,7 +1908,8 @@ static long writeback_chunk_size(struct bdi_writeback *wb, pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); - return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -2015,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..599c1d2641fe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: diff --git a/include/linux/fs.h b/include/linux/fs.h index a5dbfa20f8d7..6bf369095d2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* 
writeback inodes */ + long s_min_writeback_pages; } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..49e1dd96f43e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ -- cgit v1.2.3 From 4952f35f0545f3b53dab8d5fd727c4827c2a2778 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 29 Sep 2025 19:13:49 +0800 Subject: fs: Make wbc_to_tag() inline and use it in fs. The logic in wbc_to_tag() is widely used in file systems, so modify this function to be inline and use it in file systems. This patch has only passed compilation tests, but it should be fine. Signed-off-by: Julian Sun Reviewed-by: Qu Wenruo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/extent_io.c | 5 +---- fs/ceph/addr.c | 6 +----- fs/ext4/inode.c | 5 +---- fs/f2fs/data.c | 5 +---- fs/gfs2/aops.c | 5 +---- include/linux/writeback.h | 7 +++++++ mm/page-writeback.c | 6 ------ 7 files changed, 12 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..170dd7e80d11 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2460,10 +2460,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..58d6194045e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2619,10 +2619,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef38e62cda8f..826bcfb8230c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..12394fc5dd29 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 49e1dd96f43e..2a81816f7507 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -196,6 +196,13 @@ static inline void wait_on_inode(struct inode *inode) !(READ_ONCE(inode->i_state) & I_NEW)); } +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + #ifdef CONFIG_CGROUP_WRITEBACK #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..a124ab6a205d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) { -- cgit v1.2.3 From f0e7036fc9cb08bdfb27d64eee7fc003ba0bc2e5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 27 Oct 2025 10:22:30 +0200 Subject: ipv4: icmp: Add RFC 5837 support Add the ability to append the incoming IP interface information to ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is required for more meaningful traceroute results in unnumbered networks. The feature is disabled by default and controlled via a new sysctl ("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP extensions to append to ICMP error messages. Currently, only a single value is supported, but the interface and the implementation should be able to support more extensions, if needed. Clone the skb and copy the relevant data portions before modifying the skb as the caller of __icmp_send() still owns the skb after the function returns. This should be fine since by default ICMP error messages are rate limited to 1000 per second and no more than 1 per second per specific host. Trim or pad the packet to 128 bytes before appending the ICMP extension structure in order to be compatible with legacy applications that assume that the ICMP extension structure always starts at this offset (the minimum length specified by RFC 4884). 
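For reference, a minimal user-space sketch (not part of this patch) of how a receiver could locate the appended extension structure, relying only on the RFC 4884 layout used here: the original datagram is quoted at a fixed 128 bytes, and its length in 32-bit words is stored in octet 5 of the ICMP header (icmph->un.reserved[1] in the code below):

#include <stddef.h>
#include <stdint.h>

struct icmp_ext_hdr_v2 {		/* RFC 4884 extension header, 4 bytes */
	uint8_t  version;		/* version in the high 4 bits */
	uint8_t  reserved;
	uint16_t checksum;
};

/* icmp points at the ICMP header of the error message, len is its length. */
static const struct icmp_ext_hdr_v2 *find_icmp_extension(const uint8_t *icmp,
							  size_t len)
{
	size_t dgram_len = icmp[5] * 4;	/* quoted datagram length in bytes */
	size_t off = 8 + dgram_len;	/* 8-byte ICMP header + datagram */

	if (dgram_len == 0 || len < off + sizeof(struct icmp_ext_hdr_v2))
		return NULL;		/* no RFC 4884 extension present */

	return (const struct icmp_ext_hdr_v2 *)(icmp + off);
}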
Reviewed-by: Petr Machata Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251027082232.232571-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 17 +++ include/linux/icmp.h | 32 ++++++ include/net/netns/ipv4.h | 1 + net/ipv4/icmp.c | 191 ++++++++++++++++++++++++++++++++- net/ipv4/sysctl_net_ipv4.c | 11 ++ 5 files changed, 251 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index a06cb99d66dc..ece1187ba0f1 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1796,6 +1796,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN Default: 0 (disabled) +icmp_errors_extension_mask - UNSIGNED INTEGER + Bitmask of ICMP extensions to append to ICMPv4 error messages + ("Destination Unreachable", "Time Exceeded" and "Parameter Problem"). + The original datagram is trimmed / padded to 128 bytes in order to be + compatible with applications that do not comply with RFC 4884. + + Possible extensions are: + + ==== ============================================================== + 0x01 Incoming IP interface information according to RFC 5837. + Extension will include the index, IPv4 address (if present), + name and MTU of the IP interface that received the datagram + which elicited the ICMP error. + ==== ============================================================== + + Default: 0x00 (no extensions) + igmp_max_memberships - INTEGER Change the maximum number of multicast groups we can subscribe to. Default: 20 diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 0af4d210ee31..043ec5d9c882 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); +/* RFC 4884 */ +#define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128 +#define ICMP_EXT_VERSION_2 2 + +/* ICMP Extension Object Classes */ +#define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */ + +/* Interface Information Object - RFC 5837 */ +enum { + ICMP_EXT_CTYPE_IIO_ROLE_IIF, +}; + +#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6) +#define ICMP_EXT_CTYPE_IIO_MTU BIT(0) +#define ICMP_EXT_CTYPE_IIO_NAME BIT(1) +#define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2) +#define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3) + +struct icmp_ext_iio_name_subobj { + u8 len; + char name[IFNAMSIZ]; +}; + +enum { + /* RFC 5837 - Incoming IP Interface Role */ + ICMP_ERR_EXT_IIO_IIF, + /* Add new constants above. Used by "icmp_errors_extension_mask" + * sysctl. 
+ */ + ICMP_ERR_EXT_COUNT, +}; + #endif /* _LINUX_ICMP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 34eb3aecb3f2..0e96c90e56c6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { u8 sysctl_icmp_echo_ignore_broadcasts; u8 sysctl_icmp_ignore_bogus_error_responses; u8 sysctl_icmp_errors_use_inbound_ifaddr; + u8 sysctl_icmp_errors_extension_mask; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_msgs_per_sec; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1b7fb5d935ed..4abbec2f47ef 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -582,6 +582,185 @@ relookup_failed: return ERR_PTR(err); } +struct icmp_ext_iio_addr4_subobj { + __be16 afi; + __be16 reserved; + __be32 addr4; +}; + +static unsigned int icmp_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp_ext_iio_addr4_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp_ext_iio_len(); + + return ext_max_len; +} + +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + return 0; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) + continue; + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || + ipv4_is_multicast(ifa->ifa_address)) + continue; + return ifa->ifa_address; + } + + return 0; +} + +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. 
*/ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + data = icmp_ext_iio_addr4_find(dev); + if (data) { + struct icmp_ext_iio_addr4_subobj *addr4_subobj; + + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); + addr4_subobj->afi = htons(ICMP_AFI_IP); + addr4_subobj->addr4 = data; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, + unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return NULL; + } + + ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 32-bit words (RFC 4884). 
*/ + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a situation * @@ -601,6 +780,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -770,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (room <= (int)sizeof(struct iphdr)) goto ende; - icmp_param.data_len = skb_in->len - icmp_param.offset; + ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, + parm->iif); + if (ext_skb) + icmp_param.skb = ext_skb; + + icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; if (icmp_param.data_len > room) icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); @@ -785,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, trace_icmp_send(skb_in, type, code); icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); + + if (ext_skb) + consume_skb(ext_skb); ende: ip_rt_put(rt); out_unlock: @@ -1502,6 +1690,7 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_errors_extension_mask = 0; net->ipv4.sysctl_icmp_msgs_per_sec = 1000; net->ipv4.sysctl_icmp_msgs_burst = 50; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 24dbc603cc44..0c7c8f9041cb 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; static int tcp_ecn_mode_max = 2; +static u32 icmp_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -674,6 +676,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "icmp_errors_extension_mask", + .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmp_errors_extension_mask_all, + }, { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, -- cgit v1.2.3 From 26888de97b2ffe0267c12dd4e9fcd552545903f1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 25 Oct 2025 20:49:50 +0200 Subject: net: phy: add iterator mdiobus_for_each_phy Add an iterator for all PHY's on a MII bus, and phy_find_next() as a prerequisite. 
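A minimal, hypothetical use of the new iterator (illustrative only): it merely walks the PHYs that were discovered on the bus during registration and logs them.

static void example_list_phys(struct mii_bus *bus)
{
	struct phy_device *phydev;

	mdiobus_for_each_phy(bus, phydev)
		dev_info(&bus->dev, "PHY at address %d, id 0x%08x\n",
			 phydev->mdio.addr, phydev->phy_id);
}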
Signed-off-by: Heiner Kallweit Reviewed-by: Wei Fang Link: https://patch.msgid.link/cd112f15-401a-43d9-8525-9ff0965a68cd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 16 +++++++++------- include/linux/phy.h | 11 ++++++++++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b7feaf0cb1df..737747cf1906 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1224,22 +1224,24 @@ int phy_get_c45_ids(struct phy_device *phydev) EXPORT_SYMBOL(phy_get_c45_ids); /** - * phy_find_first - finds the first PHY device on the bus + * phy_find_next - finds the next PHY device on the bus * @bus: the target MII bus + * @pos: cursor + * + * Return: next phy_device on the bus, or NULL */ -struct phy_device *phy_find_first(struct mii_bus *bus) +struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos) { - struct phy_device *phydev; - int addr; + for (int addr = pos ? pos->mdio.addr + 1 : 0; + addr < PHY_MAX_ADDR; addr++) { + struct phy_device *phydev = mdiobus_get_phy(bus, addr); - for (addr = 0; addr < PHY_MAX_ADDR; addr++) { - phydev = mdiobus_get_phy(bus, addr); if (phydev) return phydev; } return NULL; } -EXPORT_SYMBOL(phy_find_first); +EXPORT_SYMBOL_GPL(phy_find_next); /** * phy_prepare_link - prepares the PHY layer to monitor link status diff --git a/include/linux/phy.h b/include/linux/phy.h index 17a2cdc9f1a0..358dd6f0ff96 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1869,7 +1869,7 @@ int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); -struct phy_device *phy_find_first(struct mii_bus *bus); +struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, @@ -1896,6 +1896,15 @@ bool phy_check_valid(int speed, int duplex, unsigned long *features); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); +static inline struct phy_device *phy_find_first(struct mii_bus *bus) +{ + return phy_find_next(bus, NULL); +} + +#define mdiobus_for_each_phy(_bus, _phydev) \ + for (_phydev = phy_find_first(_bus); _phydev; \ + _phydev = phy_find_next(_bus, _phydev)) + #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 00b3e8480be7a49203594bd1fdb4fd46f3b69d59 Mon Sep 17 00:00:00 2001 From: Izhar Ameer Shaikh Date: Tue, 21 Oct 2025 17:00:01 +0530 Subject: scsi: firmware: xilinx: Add support for secure read/write ioctl interface Add support for a generic ioctl read/write interface using which users can request firmware to perform read/write operations on a protected and secure address space. The functionality is introduced through the means of two new IOCTL IDs which extend the existing PM_IOCTL EEMI API: - IOCTL_READ_REG - IOCTL_MASK_WRITE_REG The caller only passes the node id of the given device and an offset. The base address is not exposed to the caller and internally retrieved by the firmware. Firmware will enforce an access policy on the incoming read/write request. 
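A sketch of how a consumer driver might use the two new calls; the node ID, offset and mask below are made-up placeholders, and the real values depend on the firmware's access policy for the device in question:

#define EXAMPLE_NODE_ID		0x30000000U	/* placeholder node ID */
#define EXAMPLE_CTRL_OFFSET	0x100U		/* placeholder 20-bit offset */
#define EXAMPLE_ENABLE_MASK	BIT(0)

static int example_enable_feature(void)
{
	u32 val;
	int ret;

	/* Firmware enforces its access policy before returning the value. */
	ret = zynqmp_pm_sec_read_reg(EXAMPLE_NODE_ID, EXAMPLE_CTRL_OFFSET, &val);
	if (ret)
		return ret;

	if (val & EXAMPLE_ENABLE_MASK)
		return 0;	/* already enabled */

	/* The masked read-modify-write is performed by the firmware. */
	return zynqmp_pm_sec_mask_write_reg(EXAMPLE_NODE_ID, EXAMPLE_CTRL_OFFSET,
					    EXAMPLE_ENABLE_MASK,
					    EXAMPLE_ENABLE_MASK);
}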
Signed-off-by: Izhar Ameer Shaikh Reviewed-by: Tanmay Shah Signed-off-by: Radhey Shyam Pandey Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-3-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- drivers/firmware/xilinx/zynqmp.c | 46 ++++++++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 15 ++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 02da3e48bc8f..b7cd0eca9eaa 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -1616,6 +1616,52 @@ int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, return zynqmp_pm_invoke_fn(PM_IOCTL, payload, 3, 0, IOCTL_GET_FEATURE_CONFIG, id); } +/** + * zynqmp_pm_sec_read_reg - PM call to securely read from given offset + * of the node + * @node_id: Node Id of the device + * @offset: Offset to be used (20-bit) + * @ret_value: Output data read from the given offset after + * firmware access policy is successfully enforced + * + * Return: Returns 0 on success or error value on failure + */ +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + u32 ret_payload[PAYLOAD_ARG_CNT]; + u32 count = 1; + int ret; + + if (!ret_value) + return -EINVAL; + + ret = zynqmp_pm_invoke_fn(PM_IOCTL, ret_payload, 4, node_id, IOCTL_READ_REG, + offset, count); + + *ret_value = ret_payload[1]; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_sec_read_reg); + +/** + * zynqmp_pm_sec_mask_write_reg - PM call to securely write to given offset + * of the node + * @node_id: Node Id of the device + * @offset: Offset to be used (20-bit) + * @mask: Mask to be used + * @value: Value to be written + * + * Return: Returns 0 on success or error value on failure + */ +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, u32 mask, + u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 5, node_id, IOCTL_MASK_WRITE_REG, + offset, mask, value); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_sec_mask_write_reg); + /** * zynqmp_pm_set_sd_config - PM call to set value of SD config registers * @node: SD node ID diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..b161f37de5cc 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -241,6 +241,7 @@ enum pm_ioctl_id { IOCTL_GET_FEATURE_CONFIG = 27, /* IOCTL for Secure Read/Write Interface */ IOCTL_READ_REG = 28, + IOCTL_MASK_WRITE_REG = 29, /* Dynamic SD/GEM configuration */ IOCTL_SET_SD_CONFIG = 30, IOCTL_SET_GEM_CONFIG = 31, @@ -619,6 +620,9 @@ int zynqmp_pm_feature(const u32 api_id); int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value); +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); int zynqmp_pm_force_pwrdwn(const u32 target, const enum zynqmp_pm_request_ack ack); @@ -916,6 +920,17 @@ static inline int zynqmp_pm_request_wake(const u32 node, return -ENODEV; } +static inline int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + return -ENODEV; +} + +static inline int 
zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value) +{ + return -ENODEV; +} + static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode) { return -ENODEV; -- cgit v1.2.3 From 0e4d26f79a74bc633846a27a9a20d52217c108dc Mon Sep 17 00:00:00 2001 From: Ajay Neeli Date: Tue, 21 Oct 2025 17:00:02 +0530 Subject: scsi: firmware: xilinx: Add APIs for UFS PHY initialization - Add APIs for UFS PHY initialization. - Verify M-PHY TX-RX configuration readiness. - Confirm SRAM initialization and Set SRAM bypass. - Retrieve UFS calibration values. Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-4-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- drivers/firmware/xilinx/Makefile | 2 +- drivers/firmware/xilinx/zynqmp-ufs.c | 118 +++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp-ufs.h | 38 ++++++++++ include/linux/firmware/xlnx-zynqmp.h | 1 + 4 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/xilinx/zynqmp-ufs.c create mode 100644 include/linux/firmware/xlnx-zynqmp-ufs.h (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/Makefile b/drivers/firmware/xilinx/Makefile index 875a53703c82..70f8f02f14a3 100644 --- a/drivers/firmware/xilinx/Makefile +++ b/drivers/firmware/xilinx/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Xilinx firmwares -obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o +obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o zynqmp-ufs.o obj-$(CONFIG_ZYNQMP_FIRMWARE_DEBUG) += zynqmp-debug.o diff --git a/drivers/firmware/xilinx/zynqmp-ufs.c b/drivers/firmware/xilinx/zynqmp-ufs.c new file mode 100644 index 000000000000..85da8a822f3a --- /dev/null +++ b/drivers/firmware/xilinx/zynqmp-ufs.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Firmware Layer for UFS APIs + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include +#include + +/* Register Node IDs */ +#define PM_REGNODE_PMC_IOU_SLCR 0x30000002 /* PMC IOU SLCR */ +#define PM_REGNODE_EFUSE_CACHE 0x30000003 /* EFUSE Cache */ + +/* Register Offsets for PMC IOU SLCR */ +#define SRAM_CSR_OFFSET 0x104C /* SRAM Control and Status */ +#define TXRX_CFGRDY_OFFSET 0x1054 /* M-PHY TX-RX Config ready */ + +/* Masks for SRAM Control and Status Register */ +#define SRAM_CSR_INIT_DONE_MASK BIT(0) /* SRAM initialization done */ +#define SRAM_CSR_EXT_LD_DONE_MASK BIT(1) /* SRAM External load done */ +#define SRAM_CSR_BYPASS_MASK BIT(2) /* Bypass SRAM interface */ + +/* Mask to check M-PHY TX-RX configuration readiness */ +#define TX_RX_CFG_RDY_MASK GENMASK(3, 0) + +/* Register Offsets for EFUSE Cache */ +#define UFS_CAL_1_OFFSET 0xBE8 /* UFS Calibration Value */ + +/** + * zynqmp_pm_is_mphy_tx_rx_config_ready - check M-PHY TX-RX config readiness + * @is_ready: Store output status (true/false) + * + * Return: Returns 0 on success or error value on failure. 
+ */ +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + u32 regval; + int ret; + + if (!is_ready) + return -EINVAL; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, TXRX_CFGRDY_OFFSET, ®val); + if (ret) + return ret; + + regval &= TX_RX_CFG_RDY_MASK; + if (regval) + *is_ready = true; + else + *is_ready = false; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_is_mphy_tx_rx_config_ready); + +/** + * zynqmp_pm_is_sram_init_done - check SRAM initialization + * @is_done: Store output status (true/false) + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + u32 regval; + int ret; + + if (!is_done) + return -EINVAL; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, ®val); + if (ret) + return ret; + + regval &= SRAM_CSR_INIT_DONE_MASK; + if (regval) + *is_done = true; + else + *is_done = false; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_is_sram_init_done); + +/** + * zynqmp_pm_set_sram_bypass - Set SRAM bypass Control + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_set_sram_bypass(void) +{ + u32 sram_csr; + int ret; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, &sram_csr); + if (ret) + return ret; + + sram_csr &= ~SRAM_CSR_EXT_LD_DONE_MASK; + sram_csr |= SRAM_CSR_BYPASS_MASK; + + return zynqmp_pm_sec_mask_write_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, + GENMASK(2, 1), sram_csr); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_set_sram_bypass); + +/** + * zynqmp_pm_get_ufs_calibration_values - Read UFS calibration values + * @val: Store the calibration value + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return zynqmp_pm_sec_read_reg(PM_REGNODE_EFUSE_CACHE, UFS_CAL_1_OFFSET, val); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_get_ufs_calibration_values); diff --git a/include/linux/firmware/xlnx-zynqmp-ufs.h b/include/linux/firmware/xlnx-zynqmp-ufs.h new file mode 100644 index 000000000000..d3538dd5822a --- /dev/null +++ b/include/linux/firmware/xlnx-zynqmp-ufs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Firmware layer for UFS APIs. + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ */ + +#ifndef __FIRMWARE_XLNX_ZYNQMP_UFS_H__ +#define __FIRMWARE_XLNX_ZYNQMP_UFS_H__ + +#if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready); +int zynqmp_pm_is_sram_init_done(bool *is_done); +int zynqmp_pm_set_sram_bypass(void); +int zynqmp_pm_get_ufs_calibration_values(u32 *val); +#else +static inline int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_sram_bypass(void) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return -ENODEV; +} +#endif + +#endif /* __FIRMWARE_XLNX_ZYNQMP_UFS_H__ */ diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index b161f37de5cc..784d5920b4cd 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -16,6 +16,7 @@ #include #include +#include #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 -- cgit v1.2.3 From 39c89ee6e9c4464eb366f4e594379454a6c4db39 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 25 Oct 2025 21:53:18 +0100 Subject: compiler_types: Introduce __nocfi_generic There are two different ways that LLVM can expand kCFI operand bundles in LLVM IR: generically in the middle end or using an architecture specific sequence when lowering LLVM IR to machine code in the backend. The generic pass allows any architecture to take advantage of kCFI but the expansion of these bundles in the middle end can mess with optimizations that may turn indirect calls into direct calls when the call target is known at compile time, such as after inlining. Add __nocfi_generic, dependent on an architecture selecting CONFIG_ARCH_USES_CFI_GENERIC_LLVM_PASS, to disable kCFI bundle generation in functions where only the generic kCFI pass may cause problems. Link: https://github.com/ClangBuiltLinux/linux/issues/2124 Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20251025-idpf-fix-arm-kcfi-build-error-v1-1-ec57221153ae@kernel.org Signed-off-by: Kees Cook --- arch/Kconfig | 7 +++++++ include/linux/compiler_types.h | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 74ff01133532..61130b88964b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -917,6 +917,13 @@ config ARCH_USES_CFI_TRAPS An architecture should select this option if it requires the .kcfi_traps section for KCFI trap handling. +config ARCH_USES_CFI_GENERIC_LLVM_PASS + bool + help + An architecture should select this option if it uses the generic + KCFIPass in LLVM to expand kCFI bundles instead of architecture-specific + lowering. 
+ config CFI bool "Use Kernel Control Flow Integrity (kCFI)" default CFI_CLANG diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..1414be493738 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -461,6 +461,12 @@ struct ftrace_likely_data { # define __nocfi #endif +#if defined(CONFIG_ARCH_USES_CFI_GENERIC_LLVM_PASS) +# define __nocfi_generic __nocfi +#else +# define __nocfi_generic +#endif + /* * Any place that could be marked with the "alloc_size" attribute is also * a place to be marked with the "malloc" attribute, except those that may -- cgit v1.2.3 From c99d30706043481a1d631bbd9c7a4b70fe002a2b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:01 -0700 Subject: byteorder: Add le64_to_cpu_array() and cpu_to_le64_array() Add le64_to_cpu_array() and cpu_to_le64_array(). These mirror the corresponding 32-bit functions. These will be used by the BLAKE2b code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index b3705e8bbe2b..55a44199de87 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void le64_to_cpu_array(u64 *buf, unsigned int words) +{ + while (words--) { + __le64_to_cpus(buf); + buf++; + } +} + +static inline void cpu_to_le64_array(u64 *buf, unsigned int words) +{ + while (words--) { + __cpu_to_le64s(buf); + buf++; + } +} + static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) { size_t i; -- cgit v1.2.3 From 74a7b4f18396f07e87c7fda5c19d1fcfb8c1dd44 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:08:02 -0700 Subject: sysctl: fix kernel-doc format warning Describe the "type" struct member using '@type' and move it together with the rest of the doc for ctl_table_header to avoid a kernel-doc warning: Warning: include/linux/sysctl.h:178 Incorrect use of kernel-doc format: * enum type - Enumeration to differentiate between ctl target types Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration") Signed-off-by: Randy Dunlap Signed-off-by: Joel Granados --- include/linux/sysctl.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 92e9146b1104..28c4a997fd21 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -156,6 +156,10 @@ struct ctl_node { * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. 
Introduced with "unfuck proc_sysctl ->d_compare()" * + * @type: Enumeration to differentiate between ctl target types + * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations + * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir + * target to serve as a mount point */ struct ctl_table_header { union { @@ -175,13 +179,6 @@ struct ctl_table_header { struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ - /** - * enum type - Enumeration to differentiate between ctl target types - * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations - * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently - * empty directory target to serve - * as mount point. - */ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY, -- cgit v1.2.3 From 3c0c81de525d2a2718e23754a5795483167904ac Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Tue, 11 Feb 2025 07:33:32 +0100 Subject: prandom: remove next_pseudo_random32 next_pseudo_random32 implements a LCG with known bad statistical properties and was only used in two pieces of testing code. With no remaining users now, remove it. Signed-off-by: Markus Theil Reviewed-by: Krzysztof Karas Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6..ff7dcc3fa105 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -47,10 +47,4 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s4 = __seed(i, 128U); } -/* Pseudo random number generator from numerical recipes. */ -static inline u32 next_pseudo_random32(u32 seed) -{ - return seed * 1664525 + 1013904223; -} - #endif -- cgit v1.2.3 From 8e4ec90701efec7f2814c89b398d6d4272636814 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Oct 2025 07:55:55 -1000 Subject: freezer: Clarify that only cgroup1 freezer uses PM freezer cgroup1 freezer piggybacks on the PM freezer, which inadvertently allowed userspace to produce uninterruptible tasks at will. To avoid the issue, cgroup2 freezer switched to a separate job control based mechanism. While this happened a long time ago, the code and comment haven't been updated making it confusing to people who aren't familiar with the history. Rename cgroup_freezing() to cgroup1_freezing() and update comments on top of freezing() and frozen() to clarify that cgroup2 freezer isn't covered by the PM freezer mechanism. Signed-off-by: Tejun Heo Suggested-by: Qu Wenruo Link: https://patch.msgid.link/aPZ3q6Hm865NicBC@slm.duckdns.org Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 12 ++++++++---- kernel/cgroup/legacy_freezer.c | 2 +- kernel/freezer.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. 
*/ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. */ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) -- cgit v1.2.3 From c9822fad8038870bb690543539c8e9ad5213b12f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:14 +0100 Subject: libfs: allow to specify s_d_flags Make it possible for pseudo filesystems to specify default dentry flags. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-1-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/libfs.c | 1 + include/linux/pseudo_fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..4bb4d8a313e7 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; + s->s_d_flags |= ctx->s_d_flags; root = new_inode(s); if (!root) return -ENOMEM; diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 2503f7625d65..a651e60d9410 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -9,6 +9,7 @@ struct pseudo_fs_context { const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; + unsigned int s_d_flags; }; struct pseudo_fs_context *init_pseudo(struct fs_context *fc, -- cgit v1.2.3 From 4511fd86db6f8f94f8aff01044f5c69aa38f81f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Oct 2025 18:08:09 +0100 Subject: filemap: Add folio_next_pos() Replace the open-coded implementation in ocfs2 (which loses the top 32 bits on 32-bit architectures) with a helper in pagemap.h. 
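A minimal illustration of the truncation being fixed (a sketch, not additional patch content): pgoff_t is only 32 bits wide on 32-bit architectures, so the widening cast has to happen before the shift, which is exactly what the new helper does:

	/* open-coded ocfs2 version: the shift is evaluated in 32 bits, so the
	 * upper bits of the byte offset are lost on 32-bit kernels */
	start = folio_next_index(folio) << PAGE_SHIFT;

	/* folio_next_pos(): widen to loff_t first, then shift */
	start = (loff_t)folio_next_index(folio) << PAGE_SHIFT;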
Fixes: 35edec1d52c0 (ocfs2: update truncate handling of partial clusters) Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251024170822.1427218-2-willy@infradead.org Reviewed-by: Joseph Qi Reviewed-by: Christoph Hellwig Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: ocfs2-devel@lists.linux.dev Signed-off-by: Christian Brauner --- fs/ocfs2/alloc.c | 2 +- include/linux/pagemap.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 162711cc5b20..b267ec580da9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, &phys); - start = folio_next_index(folio) << PAGE_SHIFT; + start = folio_next_pos(folio); } out: if (folios) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..e16576e3763a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -941,6 +941,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio) return folio->index + folio_nr_pages(folio); } +/** + * folio_next_pos - Get the file position of the next folio. + * @folio: The current folio. + * + * Return: The position of the folio which follows this folio in the file. + */ +static inline loff_t folio_next_pos(const struct folio *folio) +{ + return (loff_t)folio_next_index(folio) << PAGE_SHIFT; +} + /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. -- cgit v1.2.3 From 4f6b0435c613fdb76d85bb4aae009309a8ce8784 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 29 Oct 2025 23:16:18 +0000 Subject: can: convert generic HW timestamp ioctl to ndo_hwtstamp callbacks Can has generic implementation of ndo_eth_ioctl which implements only HW timestamping commands. Implement generic ndo_hwtstamp callbacks and use it in drivers instead of generic ioctl interface. 
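For reference, a hedged userspace sketch (not part of the patch; the function name is illustrative) of the only configuration the generic CAN handlers accept; any other combination is rejected with -ERANGE:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/sockios.h>
	#include <linux/net_tstamp.h>

	static int enable_can_hw_timestamps(int sock, const char *ifname)
	{
		struct hwtstamp_config cfg = {
			.tx_type   = HWTSTAMP_TX_ON,
			.rx_filter = HWTSTAMP_FILTER_ALL,
		};
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_data = (void *)&cfg;

		/* served by can_hwtstamp_set() via ndo_hwtstamp_set */
		return ioctl(sock, SIOCSHWTSTAMP, &ifr);
	}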
Signed-off-by: Vadim Fedorenko Reviewed-by: Kory Maincent Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20251029231620.1135640-2-vadim.fedorenko@linux.dev Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 45 +++++++++++----------- drivers/net/can/esd/esd_402_pci-core.c | 3 +- drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c | 3 +- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 3 +- drivers/net/can/usb/etas_es58x/es58x_core.c | 3 +- drivers/net/can/usb/gs_usb.c | 20 ++++++++-- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 3 +- include/linux/can/dev.h | 6 ++- 8 files changed, 54 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 0cc3d008adb3..80e1ab18de87 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -379,34 +379,33 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) } EXPORT_SYMBOL_GPL(can_set_static_ctrlmode); -/* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices +/* generic implementation of netdev_ops::ndo_hwtstamp_get for CAN devices * supporting hardware timestamps */ -int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd) +int can_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg) { - struct hwtstamp_config hwts_cfg = { 0 }; - - switch (cmd) { - case SIOCSHWTSTAMP: /* set */ - if (copy_from_user(&hwts_cfg, ifr->ifr_data, sizeof(hwts_cfg))) - return -EFAULT; - if (hwts_cfg.tx_type == HWTSTAMP_TX_ON && - hwts_cfg.rx_filter == HWTSTAMP_FILTER_ALL) - return 0; - return -ERANGE; - - case SIOCGHWTSTAMP: /* get */ - hwts_cfg.tx_type = HWTSTAMP_TX_ON; - hwts_cfg.rx_filter = HWTSTAMP_FILTER_ALL; - if (copy_to_user(ifr->ifr_data, &hwts_cfg, sizeof(hwts_cfg))) - return -EFAULT; - return 0; + cfg->tx_type = HWTSTAMP_TX_ON; + cfg->rx_filter = HWTSTAMP_FILTER_ALL; - default: - return -EOPNOTSUPP; - } + return 0; +} +EXPORT_SYMBOL(can_hwtstamp_get); + +/* generic implementation of netdev_ops::ndo_hwtstamp_set for CAN devices + * supporting hardware timestamps + */ +int can_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + if (cfg->tx_type == HWTSTAMP_TX_ON && + cfg->rx_filter == HWTSTAMP_FILTER_ALL) + return 0; + NL_SET_ERR_MSG_MOD(extack, "Only TX on and RX all packets filter supported"); + return -ERANGE; } -EXPORT_SYMBOL(can_eth_ioctl_hwts); +EXPORT_SYMBOL(can_hwtstamp_set); /* generic implementation of ethtool_ops::get_ts_info for CAN devices * supporting hardware timestamps diff --git a/drivers/net/can/esd/esd_402_pci-core.c b/drivers/net/can/esd/esd_402_pci-core.c index 05adecae6375..c826f00c551b 100644 --- a/drivers/net/can/esd/esd_402_pci-core.c +++ b/drivers/net/can/esd/esd_402_pci-core.c @@ -86,7 +86,8 @@ static const struct net_device_ops pci402_acc_netdev_ops = { .ndo_open = acc_open, .ndo_stop = acc_close, .ndo_start_xmit = acc_start_xmit, - .ndo_eth_ioctl = can_eth_ioctl_hwts, + .ndo_hwtstamp_get = can_hwtstamp_get, + .ndo_hwtstamp_set = can_hwtstamp_set, }; static const struct ethtool_ops pci402_acc_ethtool_ops = { diff --git a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c index 705f9bb74cd2..d8c9bfb20230 100644 --- a/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c +++ b/drivers/net/can/kvaser_pciefd/kvaser_pciefd_core.c @@ -902,8 +902,9 @@ static void kvaser_pciefd_bec_poll_timer(struct timer_list *data) static const struct 
net_device_ops kvaser_pciefd_netdev_ops = { .ndo_open = kvaser_pciefd_open, .ndo_stop = kvaser_pciefd_stop, - .ndo_eth_ioctl = can_eth_ioctl_hwts, .ndo_start_xmit = kvaser_pciefd_start_xmit, + .ndo_hwtstamp_get = can_hwtstamp_get, + .ndo_hwtstamp_set = can_hwtstamp_set, }; static int kvaser_pciefd_set_phys_id(struct net_device *netdev, diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 9402530ba3d4..c0f9d9fed02e 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1714,7 +1714,8 @@ static const struct net_device_ops mcp251xfd_netdev_ops = { .ndo_open = mcp251xfd_open, .ndo_stop = mcp251xfd_stop, .ndo_start_xmit = mcp251xfd_start_xmit, - .ndo_eth_ioctl = can_eth_ioctl_hwts, + .ndo_hwtstamp_get = can_hwtstamp_get, + .ndo_hwtstamp_set = can_hwtstamp_set, }; static void diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 47d9e03f3044..f799233c2b72 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -1976,7 +1976,8 @@ static const struct net_device_ops es58x_netdev_ops = { .ndo_open = es58x_open, .ndo_stop = es58x_stop, .ndo_start_xmit = es58x_start_xmit, - .ndo_eth_ioctl = can_eth_ioctl_hwts, + .ndo_hwtstamp_get = can_hwtstamp_get, + .ndo_hwtstamp_set = can_hwtstamp_set, }; static const struct ethtool_ops es58x_ethtool_ops = { diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 30608901a974..1321eb5e89ae 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -1087,12 +1087,25 @@ static int gs_can_close(struct net_device *netdev) return 0; } -static int gs_can_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) +static int gs_can_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg) { const struct gs_can *dev = netdev_priv(netdev); if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) - return can_eth_ioctl_hwts(netdev, ifr, cmd); + return can_hwtstamp_get(netdev, cfg); + + return -EOPNOTSUPP; +} + +static int gs_can_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct gs_can *dev = netdev_priv(netdev); + + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + return can_hwtstamp_set(netdev, cfg, extack); return -EOPNOTSUPP; } @@ -1101,7 +1114,8 @@ static const struct net_device_ops gs_usb_netdev_ops = { .ndo_open = gs_can_open, .ndo_stop = gs_can_close, .ndo_start_xmit = gs_can_start_xmit, - .ndo_eth_ioctl = gs_can_eth_ioctl, + .ndo_hwtstamp_get = gs_can_hwtstamp_get, + .ndo_hwtstamp_set = gs_can_hwtstamp_set, }; static int gs_usb_set_identify(struct net_device *netdev, bool do_identify) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 89e22b66f919..62701ec34272 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -784,8 +784,9 @@ static int kvaser_usb_set_phys_id(struct net_device *netdev, static const struct net_device_ops kvaser_usb_netdev_ops = { .ndo_open = kvaser_usb_open, .ndo_stop = kvaser_usb_close, - .ndo_eth_ioctl = can_eth_ioctl_hwts, .ndo_start_xmit = kvaser_usb_start_xmit, + .ndo_hwtstamp_get = can_hwtstamp_get, + .ndo_hwtstamp_set = can_hwtstamp_set, }; static const struct ethtool_ops kvaser_usb_ethtool_ops = { diff --git a/include/linux/can/dev.h 
b/include/linux/can/dev.h index 0fe8f80f223e..bd7410b5d8a6 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -129,7 +129,11 @@ void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); -int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); +int can_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg); +int can_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct kernel_ethtool_ts_info *info); -- cgit v1.2.3 From 0de4c70d04a46a3c266547dd4275ce25f623796a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: tracing: fprobe: use rhltable for fprobe_ip_table For now, all the kernel functions who are hooked by the fprobe will be added to the hash table "fprobe_ip_table". The key of it is the function address, and the value of it is "struct fprobe_hlist_node". The budget of the hash table is FPROBE_IP_TABLE_SIZE, which is 256. And this means the overhead of the hash table lookup will grow linearly if the count of the functions in the fprobe more than 256. When we try to hook all the kernel functions, the overhead will be huge. Therefore, replace the hash table with rhltable to reduce the overhead. Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 3 +- kernel/trace/fprobe.c | 157 ++++++++++++++++++++++++++++--------------------- 2 files changed, 93 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 7964db96e41a..0a3bcd1718f3 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct fprobe; @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, * @fp: The fprobe which owns this. */ struct fprobe_hlist_node { - struct hlist_node hlist; + struct rhlist_head hlist; unsigned long addr; struct fprobe *fp; }; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 5a807d62e76d..e063e22e1134 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -41,60 +42,67 @@ * - RCU hlist traversal under disabling preempt */ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); -/* - * Find first fprobe in the hlist. It will be iterated twice in the entry - * probe, once for correcting the total required size, the second time is - * calling back the user handlers. - * Thus the hlist in the fprobe_table must be sorted and new probe needs to - * be added *before* the first fprobe. 
- */ -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { - struct fprobe_hlist_node *node; - struct hlist_head *head; + return hash_ptr(*(unsigned long **)data, 32); +} - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (node->addr == ip) - return node; - } - return NULL; +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; } -NOKPROBE_SYMBOL(find_first_fprobe_node); -/* Node insertion and deletion requires the fprobe_mutex */ -static void insert_fprobe_node(struct fprobe_hlist_node *node) +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) { - unsigned long ip = node->addr; - struct fprobe_hlist_node *next; - struct hlist_head *head; + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, +}; + +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node) +{ lockdep_assert_held(&fprobe_mutex); - next = find_first_fprobe_node(ip); - if (next) { - hlist_add_before_rcu(&node->hlist, &next->hlist); - return; - } - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_add_head_rcu(&node->hlist, head); + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } /* Return true if there are synonims */ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); + bool ret; /* Avoid double deleting */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } - return !!find_first_fprobe_node(node->addr); + + rcu_read_lock(); + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, + fprobe_rht_params); + rcu_read_unlock(); + + return ret; } /* Check existence of the fprobe */ @@ -249,9 +257,10 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) { - struct fprobe_hlist_node *node, *first; unsigned long *fgraph_data = NULL; unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; unsigned long ret_ip; int reserved_words; struct fprobe *fp; @@ -260,14 +269,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; - first = node = find_first_fprobe_node(func); - if (unlikely(!first)) - return 0; - + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || !fp->exit_handler) continue; @@ -278,13 +284,12 @@ static int fprobe_entry(struct 
ftrace_graph_ent *trace, struct fgraph_ops *gops, reserved_words += FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); } - node = first; if (reserved_words) { fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); if (unlikely(!fgraph_data)) { - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (fp && !fprobe_disabled(fp)) fp->nmissed++; @@ -299,12 +304,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, */ ret_ip = ftrace_regs_get_return_address(fregs); used = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { int data_size; void *data; if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || fprobe_disabled(fp)) continue; @@ -449,25 +454,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad return 0; } -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, - struct fprobe_addr_list *alist) +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (!within_module(node->addr, mod)) - continue; - if (delete_fprobe_node(node)) - continue; - /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. - */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); - } + if (!within_module(node->addr, mod)) + return; + if (delete_fprobe_node(node)) + return; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. 
*/ @@ -475,8 +476,9 @@ static int fprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; struct module *mod = data; - int i; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; @@ -487,8 +489,16 @@ static int fprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; mutex_lock(&fprobe_mutex); - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + fprobe_remove_node_in_module(mod, node, &alist); + + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, @@ -728,8 +738,16 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) ret = fprobe_graph_add_ips(addrs, num); if (!ret) { add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) - insert_fprobe_node(&hlist_array->array[i]); + for (i = 0; i < hlist_array->size; i++) { + ret = insert_fprobe_node(&hlist_array->array[i]); + if (ret) + break; + } + /* fallback on insert error */ + if (ret) { + for (i--; i >= 0; i--) + delete_fprobe_node(&hlist_array->array[i]); + } } mutex_unlock(&fprobe_mutex); @@ -825,3 +843,10 @@ out: return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +late_initcall(fprobe_initcall); -- cgit v1.2.3 From 14a7f2392f42bbb71c1a5ea68930006221fcd80a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Oct 2025 11:36:46 -0700 Subject: bpf: Make migrate_disable always inline to avoid partial inlining The build fails with llvm 21/22: $ make LLVM=1 -j ... LD vmlinux.o GEN .vmlinux.objs ... BTF .tmp_vmlinux1.btf.o ... AS .tmp_vmlinux2.kallsyms.o LD vmlinux.unstripped BTFIDS vmlinux.unstripped WARN: resolve_btfids: unresolved symbol migrate_enable WARN: resolve_btfids: unresolved symbol migrate_disable make[2]: *** [vmlinux.unstripped] Error 255 make[2]: *** Deleting file 'vmlinux.unstripped' make[1]: *** [Makefile:1242: vmlinux] Error 2 make: *** [Makefile:248: __sub-make] Error 2 Two functions with identical names but different addresses are considered ambiguous and removed by "pahole" from vmlinux BTF. Later resolve_btfids warns since it cannot find them. Commit 378b7708194f ("sched: Make migrate_{en,dis}able() inline") made them inlineable in most places, but in vmlinux built with llvm 21 and 22 there are four symbols for migrate_{enable,disable}: three static functions and one global function. Fix the issue by marking migrate_{enable,disable} as always inline. The alternative is to mark them as notrace/nokprobe which is more drastic. Only bpf programs are prevented from attaching to these functions. The rest of the tracing shouldn't be affected. 
[note: Peter ok-ed the patch, Alexei rewrote commit log] Fixes: 378b7708194f ("sched: Make migrate_{en,dis}able() inline") Signed-off-by: Yonghong Song Acked-by: Menglong Dong Link: https://lore.kernel.org/r/20251029183646.3811774-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..b469878de25c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2407,12 +2407,12 @@ static inline void __migrate_enable(void) { } * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE -static inline void migrate_disable(void) +static __always_inline void migrate_disable(void) { __migrate_disable(); } -static inline void migrate_enable(void) +static __always_inline void migrate_enable(void) { __migrate_enable(); } -- cgit v1.2.3 From 68c4c159a0db4409a5d6b5f4703d71b89a96f06a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 14:30:32 +0000 Subject: genirq: Fix percpu_devid irq affinity documentation Stephen points out that some of the percpu_devid irq affinity documentation is either missing or not matching the data structures. Address all the issues in one go. Fixes: 87b0031f7f73 ("irqdomain: Add firmware info reporting interface") Fixes: 258e7d28a3dc ("genirq: Add affinity to percpu_devid interrupt requests") Reported-by: Stephen Rothwell Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251030143032.2035987-1-maz@kernel.org --- include/linux/interrupt.h | 1 + include/linux/irqdomain.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index fa62ab556ee3..266f2b39213a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device + * @affinity: CPUs this irqaction is allowed to run on * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 5907baf6099d..952d3c8dd6b7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -48,7 +48,7 @@ struct irq_fwspec { * struct irq_fwspec_info - firmware provided IRQ information structure * * @flags: Information validity flags - * @cpumask: Affinity mask for this interrupt + * @affinity: Affinity mask for this interrupt * * This structure reports firmware-specific information about an * interrupt. The only significant information is the affinity of a -- cgit v1.2.3 From 30176bf7c871681df506f3165ffe76ec462db991 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 29 Oct 2025 16:32:06 +0100 Subject: dpll: add phase-adjust-gran pin attribute Phase-adjust values are currently limited by a min-max range. Some hardware requires, for certain pin types, that values be multiples of a specific granularity, as in the zl3073x driver. Add a `phase-adjust-gran` pin attribute and an appropriate field in dpll_pin_properties. If set by the driver, use its value to validate user-provided phase-adjust values. 
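As an illustration, a hedged driver-side sketch (the label and numeric values are made up, not taken from any driver) of how a pin would advertise the new granularity alongside the existing range:

	static const struct dpll_pin_properties example_pin_props = {
		.board_label = "example-input",
		.type = DPLL_PIN_TYPE_EXT,
		.phase_range = {
			.min = -10000,	/* picoseconds */
			.max =  10000,
		},
		/* user-supplied phase-adjust values must be multiples of 10 ps */
		.phase_gran = 10,
	};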
Reviewed-by: Michal Schmidt Reviewed-by: Petr Oros Tested-by: Prathosh Satish Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Reviewed-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20251029153207.178448-2-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 36 +++++++++++++++++++---------------- Documentation/netlink/specs/dpll.yaml | 7 +++++++ drivers/dpll/dpll_netlink.c | 12 +++++++++++- include/linux/dpll.h | 1 + include/uapi/linux/dpll.h | 1 + 5 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index be1fc643b645..83118c728ed9 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -198,26 +198,28 @@ be requested with the same attribute with ``DPLL_CMD_DEVICE_SET`` command. ================================== ====================================== Device may also provide ability to adjust a signal phase on a pin. -If pin phase adjustment is supported, minimal and maximal values that pin -handle shall be provide to the user on ``DPLL_CMD_PIN_GET`` respond -with ``DPLL_A_PIN_PHASE_ADJUST_MIN`` and ``DPLL_A_PIN_PHASE_ADJUST_MAX`` +If pin phase adjustment is supported, minimal and maximal values and +granularity that pin handle shall be provided to the user on +``DPLL_CMD_PIN_GET`` respond with ``DPLL_A_PIN_PHASE_ADJUST_MIN``, +``DPLL_A_PIN_PHASE_ADJUST_MAX`` and ``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attributes. Configured phase adjust value is provided with ``DPLL_A_PIN_PHASE_ADJUST`` attribute of a pin, and value change can be requested with the same attribute with ``DPLL_CMD_PIN_SET`` command. - =============================== ====================================== - ``DPLL_A_PIN_ID`` configured pin id - ``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment - ``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase adjustment - ``DPLL_A_PIN_PHASE_ADJUST`` attr configured value of phase - adjustment on parent dpll device - ``DPLL_A_PIN_PARENT_DEVICE`` nested attribute for requesting - configuration on given parent dpll - device - ``DPLL_A_PIN_PARENT_ID`` parent dpll device id - ``DPLL_A_PIN_PHASE_OFFSET`` attr measured phase difference - between a pin and parent dpll device - =============================== ====================================== + ================================ ========================================== + ``DPLL_A_PIN_ID`` configured pin id + ``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attr granularity of phase adjustment value + ``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment + ``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase adjustment + ``DPLL_A_PIN_PHASE_ADJUST`` attr configured value of phase + adjustment on parent dpll device + ``DPLL_A_PIN_PARENT_DEVICE`` nested attribute for requesting + configuration on given parent dpll + device + ``DPLL_A_PIN_PARENT_ID`` parent dpll device id + ``DPLL_A_PIN_PHASE_OFFSET`` attr measured phase difference + between a pin and parent dpll device + ================================ ========================================== All phase related values are provided in pico seconds, which represents time difference between signals phase. The negative value means that @@ -384,6 +386,8 @@ according to attribute purpose. 
frequencies ``DPLL_A_PIN_ANY_FREQUENCY_MIN`` attr minimum value of frequency ``DPLL_A_PIN_ANY_FREQUENCY_MAX`` attr maximum value of frequency + ``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attr granularity of phase + adjustment value ``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment ``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 80728f6f9bc8..78d0724d7e12 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -440,6 +440,12 @@ attribute-sets: doc: | Capable pin provides list of pins that can be bound to create a reference-sync pin pair. + - + name: phase-adjust-gran + type: u32 + doc: | + Granularity of phase adjustment, in picoseconds. The value of + phase adjustment must be a multiple of this granularity. - name: pin-parent-device @@ -616,6 +622,7 @@ operations: - capabilities - parent-device - parent-pin + - phase-adjust-gran - phase-adjust-min - phase-adjust-max - phase-adjust diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index a4153bcb6dcf..64944f601ee5 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -637,6 +637,10 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin, ret = dpll_msg_add_pin_freq(msg, pin, ref, extack); if (ret) return ret; + if (prop->phase_gran && + nla_put_u32(msg, DPLL_A_PIN_PHASE_ADJUST_GRAN, + prop->phase_gran)) + return -EMSGSIZE; if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MIN, prop->phase_range.min)) return -EMSGSIZE; @@ -1261,7 +1265,13 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr, if (phase_adj > pin->prop.phase_range.max || phase_adj < pin->prop.phase_range.min) { NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr, - "phase adjust value not supported"); + "phase adjust value of out range"); + return -EINVAL; + } + if (pin->prop.phase_gran && phase_adj % (s32)pin->prop.phase_gran) { + NL_SET_ERR_MSG_ATTR_FMT(extack, phase_adj_attr, + "phase adjust value not multiple of %u", + pin->prop.phase_gran); return -EINVAL; } diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 25be745bf41f..562f520b23c2 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -163,6 +163,7 @@ struct dpll_pin_properties { u32 freq_supported_num; struct dpll_pin_frequency *freq_supported; struct dpll_pin_phase_adjust_range phase_range; + u32 phase_gran; }; #if IS_ENABLED(CONFIG_DPLL) diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index ab1725a954d7..69d35570ac4f 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -251,6 +251,7 @@ enum dpll_a_pin { DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED, DPLL_A_PIN_ESYNC_PULSE, DPLL_A_PIN_REFERENCE_SYNC, + DPLL_A_PIN_PHASE_ADJUST_GRAN, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From 652a86b24c5ac444afaf7625c9340d55aab7f105 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 31 Oct 2025 14:08:32 +0100 Subject: err.h: add INIT_ERR_PTR() macro Add INIT_ERR_PTR() macro to initialize static variables with error pointers. This might be useful for specific case where there is a static variable initialized to an error condition and then later set to the real handle once probe finish/completes. This is to handle compilation problems like: error: initializer element is not constant where ERR_PTR() can't be used. 
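A hedged sketch of the intended usage pattern (the handle, device and error code are illustrative): a file-scope pointer that needs a constant initializer and is only replaced once probing completes:

	/* ERR_PTR(-EPROBE_DEFER) is not usable as a static initializer here;
	 * INIT_ERR_PTR() is. */
	static struct clk *example_clk = INIT_ERR_PTR(-EPROBE_DEFER);

	static int example_probe(struct platform_device *pdev)
	{
		struct clk *clk = devm_clk_get(&pdev->dev, NULL);

		if (IS_ERR(clk))
			return PTR_ERR(clk);

		example_clk = clk;	/* consumers now see the real handle */
		return 0;
	}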
Signed-off-by: Christian Marangi Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20251031130835.7953-2-ansuelsmth@gmail.com [bjorn: Added () suffix on macro references] Signed-off-by: Bjorn Andersson --- include/linux/err.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index 1d60aa86db53..8c37be0620ab 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,14 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/** + * INIT_ERR_PTR - Init a const error pointer. + * @error: A negative error code. + * + * Like ERR_PTR(), but usable to initialize static variables. + */ +#define INIT_ERR_PTR(error) ((void *)(error)) + /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) -- cgit v1.2.3 From 51d0656959bcdb743232f9b530b4cca569e74e7f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 27 Oct 2025 13:59:31 +0100 Subject: genirq/manage: Reduce priority of forced secondary interrupt handler Crystal reports that the PCIe Advanced Error Reporting driver gets stuck in an infinite loop on PREEMPT_RT: Both the primary interrupt handler aer_irq() as well as the secondary handler aer_isr() are forced into threads with identical priority. Crystal writes that on the ARM system in question, the primary handler has to clear an error in the Root Error Status register... "before the next error happens, or else the hardware will set the Multiple ERR_COR Received bit. If that bit is set, then aer_isr() can't rely on the Error Source Identification register, so it scans through all devices looking for errors -- and for some reason, on this system, accessing the AER registers (or any Config Space above 0x400, even though there are capabilities located there) generates an Unsupported Request Error (but returns valid data). Since this happens more than once, without aer_irq() preempting, it causes another multi error and we get stuck in a loop." The issue does not show on non-PREEMPT_RT because the primary handler runs in hardirq context and thus can preempt the threaded secondary handler, clear the Root Error Status register and prevent the secondary handler from getting stuck. Emulate the same behavior on PREEMPT_RT by assigning a lower default priority to the secondary handler if the primary handler is forced into a thread. 
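For context, a hedged sketch (handler names are hypothetical) of the request_threaded_irq() pattern this change affects: with forced interrupt threading on PREEMPT_RT the primary handler below also runs in a thread, and after this change it keeps a higher priority than the secondary one:

	/* example_ack_irq() quickly clears the source in hardware (the role
	 * aer_irq() plays in the report); example_process_irq() does the slow
	 * work and becomes the forced-threaded "secondary" handler. */
	ret = request_threaded_irq(irq, example_ack_irq, example_process_irq,
				   IRQF_SHARED, "example", dev);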
Reported-by: Crystal Wood Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Tested-by: Crystal Wood Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/f6dcdb41be2694886b8dbf4fe7b3ab89e9d5114c.1761569303.git.lukas@wunner.de Closes: https://lore.kernel.org/r/20250902224441.368483-1-crwood@redhat.com/ --- include/linux/sched.h | 1 + kernel/irq/manage.c | 5 ++++- kernel/sched/syscalls.c | 13 +++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..cd6be74d87b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1901,6 +1901,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7a09d96f4d2d..c812b6f48f2b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1239,7 +1239,10 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); - sched_set_fifo(current); + if (action->handler == irq_forced_secondary_handler) + sched_set_fifo_secondary(current); + else + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..48347950ac48 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -856,6 +856,19 @@ void sched_set_fifo_low(struct task_struct *p) } EXPORT_SYMBOL_GPL(sched_set_fifo_low); +/* + * Used when the primary interrupt handler is forced into a thread, in addition + * to the (always threaded) secondary handler. The secondary handler gets a + * slightly lower priority so that the primary handler can preempt it, thereby + * emulating the behavior of a non-PREEMPT_RT system where the primary handler + * runs in hard interrupt context. + */ +void sched_set_fifo_secondary(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} + void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { -- cgit v1.2.3 From 933ecf591275e850a46b28c6016d2688b92e23f6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 1 Nov 2025 21:19:41 -0700 Subject: random: remove unused get_random_var_wait functions None of these functions are used, so remove them. This renders the two bugs moot: - get_random_u64_wait() used the wrong pointer type, making it provide only 32 bits. - The '#undef' directive used the wrong identifier, leaving the helper macro defined. Signed-off-by: Eric Biggers Signed-off-by: Jason A. 
Donenfeld --- include/linux/random.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 333cecfca93f..8a8064dc3970 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -130,21 +130,6 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) return ret; } -#define declare_get_random_var_wait(name, ret_type) \ - static inline int get_random_ ## name ## _wait(ret_type *out) { \ - int ret = wait_for_random_bytes(); \ - if (unlikely(ret)) \ - return ret; \ - *out = get_random_ ## name(); \ - return 0; \ - } -declare_get_random_var_wait(u8, u8) -declare_get_random_var_wait(u16, u16) -declare_get_random_var_wait(u32, u32) -declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(long, unsigned long) -#undef declare_get_random_var - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); -- cgit v1.2.3 From 8ed6b8842c44a4a716dfd536e7f13aff77039a02 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 25 Sep 2025 22:09:56 +0300 Subject: power: supply: max77705_charger: implement aicl feature Adaptive input current allows charger to reduce it's current consumption, when source is not able to provide enough power. Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250925-max77705_77976_charger_improvement-v6-1-972c716c17d1@gmail.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 42 +++++++++++++++++++++++++++++++++ include/linux/power/max77705_charger.h | 2 ++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index b1a227bf72e2..35cdb10a0e89 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = { POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, }; +static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data) +{ + struct max77705_charger_data *chg = irq_drv_data; + unsigned int regval, irq_status; + int err; + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + + // irq is fiered at the end of current decrease sequence too + // early check AICL_I bit to guard against that excess irq call + while (!(irq_status & BIT(MAX77705_AICL_I))) { + err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®val); + if (err < 0) + return IRQ_HANDLED; + + regval--; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval); + if (err < 0) + return IRQ_HANDLED; + + msleep(AICL_WORK_DELAY_MS); + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + } + + return IRQ_HANDLED; +} + static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) { struct max77705_charger_data *chg = irq_drv_data; @@ -632,6 +665,15 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } + ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I), + NULL, max77705_aicl_irq, + IRQF_TRIGGER_NONE, + "aicl-irq", chg); + if (ret) { + dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n"); + goto destroy_wq; + } + ret = max77705_charger_enable(chg); if (ret) { dev_err_probe(dev, ret, "failed to enable charge\n"); diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index 
6653abfdf747..b3950ce0625e 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -123,6 +123,8 @@ #define MAX77705_DISABLE_SKIP 1 #define MAX77705_AUTO_SKIP 0 +#define AICL_WORK_DELAY_MS 100 + /* uA */ #define MAX77705_CURRENT_CHGIN_STEP 25000 #define MAX77705_CURRENT_CHG_STEP 50000 -- cgit v1.2.3 From 695f2ca1b4247724576d57eae7b74b90dc69ba3c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:16 -0500 Subject: fs/fs_parse: add back fsparam_u32hex 296b67059 removed fsparam_u32hex because there were no callers (yet) and it didn't build due to using the nonexistent symbol fs_param_is_u32_hex. fs/9p will need this parser, so add it back with the appropriate fix (use fs_param_is_u32). Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-2-sandeen@redhat.com> Signed-off-by: Dominique Martinet --- include/linux/fs_parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5a0e897cae80..5e8a3b546033 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -120,6 +120,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL) #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) +#define fsparam_u32hex(NAME, OPT) \ + __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From c95de73da12bf4586b7bcd6b23a6968c21991cc7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Oct 2025 22:41:18 -0700 Subject: mtd: spear_smi: fix kernel-doc warnings Correct most kernel-doc warnings in include/linux/mtd/spear_smi.h by adding a leading '@' to the description of struct members. Add a new description for the missing @np member. 
Warning: spear_smi.h:48 struct member 'name' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'mem_base' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'size' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'partitions' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'nr_partitions' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'fast_mode' not described in 'spear_smi_flash_info' Warning: spear_smi.h:62 struct member 'clk_rate' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'num_flashes' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'board_flash_info' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'np' not described in 'spear_smi_plat_data' Signed-off-by: Randy Dunlap Signed-off-by: Miquel Raynal --- include/linux/mtd/spear_smi.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spear_smi.h b/include/linux/mtd/spear_smi.h index 581603ac1277..871634862627 100644 --- a/include/linux/mtd/spear_smi.h +++ b/include/linux/mtd/spear_smi.h @@ -31,12 +31,12 @@ * struct spear_smi_flash_info - platform structure for passing flash * information * - * name: name of the serial nor flash for identification - * mem_base: the memory base on which the flash is mapped - * size: size of the flash in bytes - * partitions: parition details - * nr_partitions: number of partitions - * fast_mode: whether flash supports fast mode + * @name: name of the serial nor flash for identification + * @mem_base: the memory base on which the flash is mapped + * @size: size of the flash in bytes + * @partitions: partition details + * @nr_partitions: number of partitions + * @fast_mode: whether flash supports fast mode */ struct spear_smi_flash_info { @@ -51,9 +51,10 @@ struct spear_smi_flash_info { /** * struct spear_smi_plat_data - platform structure for configuring smi * - * clk_rate: clk rate at which SMI must operate - * num_flashes: number of flashes present on board - * board_flash_info: specific details of each flash present on board + * @clk_rate: clk rate at which SMI must operate + * @num_flashes: number of flashes present on board + * @board_flash_info: specific details of each flash present on board + * @np: array of DT node pointers for all possible flash chip devices */ struct spear_smi_plat_data { unsigned long clk_rate; -- cgit v1.2.3 From c3d78c34ad009a7cce57ae5b5c93e1bd03bb31a3 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 22 Sep 2025 11:30:10 +0800 Subject: perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores CPU_CYCLES is expected to count the logical CPU (PE) clock. Currently it's preferred to use PMCCNTR_EL0 for counting CPU_CYCLES, but it'll count the processor clock rather than the PE clock (ARM DDI0487 L.b D13.1.3) if one of the SMT siblings is not idle on a multi-threaded implementation. So don't use it on SMT cores. Introduce topology_core_has_smt() for detecting the SMT implementation and cache it in arm_pmu::has_smt during allocation. When counting cycles on SMT CPU 2-3 and CPU 3 is idle, without this patch we'll get: [root@client1 tmp]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] 
Performance counter stats for 'CPU(s) 2-3': CPU2 2880457316 cycles CPU3 2880459810 cycles 1.254688470 seconds time elapsed With this patch the idle state of CPU3 is observed as expected: [root@client1 ~]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] Performance counter stats for 'CPU(s) 2-3': CPU2 2558580492 cycles CPU3 305749 cycles 1.113626410 seconds time elapsed Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 6 ++++++ drivers/perf/arm_pmuv3.c | 10 ++++++++++ include/linux/arch_topology.h | 11 +++++++++++ include/linux/perf/arm_pmu.h | 1 + 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 5c310e803dd7..ae437791b5f8 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -925,6 +925,12 @@ int armpmu_register(struct arm_pmu *pmu) if (ret) return ret; + /* + * By this stage we know our supported CPUs on either DT/ACPI platforms, + * detect the SMT implementation. + */ + pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus)); + if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 69c5cc8f5606..d1d6000517b2 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -981,6 +981,7 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, struct perf_event *event) { + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; @@ -1001,6 +1002,15 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, if (has_branch_stack(event)) return false; + /* + * The PMCCNTR_EL0 increments from the processor clock rather than + * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue + * counting on a WFI PE if one of its SMT sibling is not idle on a + * multi-threaded implementation. So don't use it on SMT cores. + */ + if (cpu_pmu->has_smt) + return false; + return true; } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..daa1af2e8204 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -89,6 +89,17 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); void freq_inv_set_max_ratio(int cpu, u64 max_rate); + +/* + * Architectures like ARM64 don't have reliable architectural way to get SMT + * information and depend on the firmware (ACPI/OF) report. Non-SMT core won't + * initialize thread_id so we can use this to detect the SMT implementation. 
+ */ +static inline bool topology_core_has_smt(int cpu) +{ + return cpu_topology[cpu].thread_id != -1; +} + #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..2d39322c40c4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,6 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; + bool has_smt; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 -- cgit v1.2.3 From 56d9df41ef1847ed0523f57ec6117649d581401d Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sat, 1 Nov 2025 01:45:04 +0100 Subject: rtc: ds1685: stop setting max_user_freq max_user_freq has not been related to the hardware RTC since commit 6610e0893b8b ("RTC: Rework RTC code to use timerqueue for events"). Stop setting it from individual driver to avoid confusing new contributors. Acked-by: Joshua Kinard Link: https://patch.msgid.link/20251101-max_user_freq-v1-2-c9a274fd6883@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1685.c | 3 --- include/linux/rtc/ds1685.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 97423f1d0361..5fc8e36b1abf 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1268,9 +1268,6 @@ ds1685_rtc_probe(struct platform_device *pdev) rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc_dev->range_max = RTC_TIMESTAMP_END_2099; - /* Maximum periodic rate is 8192Hz (0.122070ms). */ - rtc_dev->max_user_freq = RTC_MAX_USER_FREQ; - /* See if the platform doesn't support UIE. */ if (pdata->uie_unsupported) clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc_dev->features); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h index 01da4582db6d..8ec0ebfaef04 100644 --- a/include/linux/rtc/ds1685.h +++ b/include/linux/rtc/ds1685.h @@ -324,7 +324,6 @@ struct ds1685_rtc_platform_data { #define RTC_SQW_2HZ 0x0f /* 0 1 1 1 1 */ #define RTC_SQW_0HZ 0x00 /* 0 0 0 0 0 */ #define RTC_SQW_32768HZ 32768 /* 1 - - - - */ -#define RTC_MAX_USER_FREQ 8192 /* -- cgit v1.2.3 From 3eb6660f26d13acdbcb9241ac3e95d44419f2284 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Oct 2025 10:40:52 +0100 Subject: uaccess: Provide ASM GOTO safe wrappers for unsafe_*_user() ASM GOTO is miscompiled by GCC when it is used inside a auto cleanup scope: bool foo(u32 __user *p, u32 val) { scoped_guard(pagefault) unsafe_put_user(val, p, efault); return true; efault: return false; } e80: e8 00 00 00 00 call e85 e85: 65 48 8b 05 00 00 00 00 mov %gs:0x0(%rip),%rax e8d: 83 80 04 14 00 00 01 addl $0x1,0x1404(%rax) // pf_disable++ e94: 89 37 mov %esi,(%rdi) e96: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- e9d: b8 01 00 00 00 mov $0x1,%eax // success ea2: e9 00 00 00 00 jmp ea7 // ret ea7: 31 c0 xor %eax,%eax // fail ea9: e9 00 00 00 00 jmp eae // ret which is broken as it leaks the pagefault disable counter on failure. Clang at least fails the build. Linus suggested to add a local label into the macro scope and let that jump to the actual caller supplied error label. __label__ local_label; \ arch_unsafe_get_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ That works for both GCC and clang. 
clang: c80: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) c85: 65 48 8b 0c 25 00 00 00 00 mov %gs:0x0,%rcx c8e: ff 81 04 14 00 00 incl 0x1404(%rcx) // pf_disable++ c94: 31 c0 xor %eax,%eax // set retval to false c96: 89 37 mov %esi,(%rdi) // write c98: b0 01 mov $0x1,%al // set retval to true c9a: ff 89 04 14 00 00 decl 0x1404(%rcx) // pf_disable-- ca0: 2e e9 00 00 00 00 cs jmp ca6 // ret The exception table entry points correctly to c9a GCC: f70: e8 00 00 00 00 call f75 f75: 65 48 8b 05 00 00 00 00 mov %gs:0x0(%rip),%rax f7d: 83 80 04 14 00 00 01 addl $0x1,0x1404(%rax) // pf_disable++ f84: 8b 17 mov (%rdi),%edx f86: 89 16 mov %edx,(%rsi) f88: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- f8f: b8 01 00 00 00 mov $0x1,%eax // success f94: e9 00 00 00 00 jmp f99 // ret f99: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- fa0: 31 c0 xor %eax,%eax // fail fa2: e9 00 00 00 00 jmp fa7 // ret The exception table entry points correctly to f99 So both compilers optimize out the extra goto and emit correct and efficient code. Provide a generic wrapper to do that to avoid modifying all the affected architecture specific implementation with that workaround. The only change required for architectures is to rename unsafe_*_user() to arch_unsafe_*_user(). That's done in subsequent changes. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/877bweujtn.ffs@tglx --- include/linux/uaccess.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..8aa82b1d6013 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -518,7 +518,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); -#ifndef __get_kernel_nofault +#ifdef arch_get_kernel_nofault +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + */ +#define __get_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_get_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_put_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ + #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ @@ -535,7 +562,8 @@ do { \ if (__put_user(data, p)) \ goto label; \ } while (0) -#endif + +#endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location @@ -549,7 +577,42 @@ do { \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) -#ifndef user_access_begin +#ifdef user_access_begin + +#ifdef arch_unsafe_get_user +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. 
+ * + * Some architectures use internal local labels already, but this extra + * indirection here is harmless because the compiler optimizes it out + * completely in any case. This construct just ensures that the ASM GOTO + * target is always in the local scope. The C goto 'label' works correctly + * when leaving a cleanup() scope. + */ +#define unsafe_get_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_get_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define unsafe_put_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_put_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) +#endif /* arch_unsafe_get_user */ + +#else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) @@ -559,7 +622,8 @@ do { \ #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } -#endif +#endif /* !user_access_begin */ + #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end -- cgit v1.2.3 From c33e779aba6804778c1440192a8033a145ba588d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:29 -0600 Subject: io_uring: add wrapper type for io_req_tw_func_t arg In preparation for uring_cmd implementations to implement functions with the io_req_tw_func_t signature, introduce a wrapper struct io_tw_req to hide the struct io_kiocb * argument. The intention is for only the io_uring core to access the inner struct io_kiocb *. uring_cmd implementations should instead call a helper from io_uring/cmd.h to convert struct io_tw_req to struct io_uring_cmd *. 
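A converted callback simply unwraps the request at the top of the function; a minimal sketch of the new shape (modelled on the conversions in this patch, the function name is illustrative) is:

static void example_tw_func(struct io_tw_req tw_req, io_tw_token_t tw)
{
	/* only the io_uring core should reach into tw_req directly */
	struct io_kiocb *req = tw_req.req;

	io_req_set_res(req, 0, 0);
	io_req_task_complete(tw_req, tw);
}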
Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- io_uring/futex.c | 16 +++++++++------- io_uring/io_uring.c | 21 ++++++++++++--------- io_uring/io_uring.h | 4 ++-- io_uring/msg_ring.c | 3 ++- io_uring/notif.c | 5 +++-- io_uring/poll.c | 11 ++++++----- io_uring/poll.h | 2 +- io_uring/rw.c | 5 +++-- io_uring/rw.h | 2 +- io_uring/timeout.c | 18 +++++++++++------- io_uring/uring_cmd.c | 3 ++- io_uring/waitid.c | 7 ++++--- 13 files changed, 61 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 25ee982eb435..f064a438ce43 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -615,7 +615,11 @@ enum { REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); +struct io_tw_req { + struct io_kiocb *req; +}; + +typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); struct io_task_work { struct llist_node node; diff --git a/io_uring/futex.c b/io_uring/futex.c index 64f3bd51c84c..4e022c76236d 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -41,24 +41,26 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - hlist_del_init(&req->hash_node); - io_req_task_complete(req, tw); + hlist_del_init(&tw_req.req->hash_node); + io_req_task_complete(tw_req, tw); } -static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); io_cache_free(&ctx->futex_cache, req->async_data); io_req_async_data_clear(req, 0); - __io_futex_complete(req, tw); + __io_futex_complete(tw_req, tw); } -static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; @@ -73,7 +75,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) } io_req_async_data_free(req); - __io_futex_complete(req, tw); + __io_futex_complete(tw_req, tw); } static bool io_futexv_claim(struct io_futex *iof) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4e6676ac4662..01631b6ff442 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -291,7 +291,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) mutex_lock(&ctx->uring_lock); ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, ts); + req->io_task_work.func((struct io_tw_req){req}, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -539,9 +539,9 @@ static void io_queue_iowq(struct io_kiocb *req) io_wq_enqueue(tctx->io_wq, &req->work); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - io_queue_iowq(req); + io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) @@ -1166,7 +1166,7 @@ struct llist_node *io_handle_tw_list(struct llist_node 
*node, } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + (struct io_tw_req){req}, ts); node = next; (*count)++; if (unlikely(need_resched())) { @@ -1389,7 +1389,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, tw); + (struct io_tw_req){req}, tw); *node = next; if (++ret >= events) break; @@ -1459,14 +1459,17 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, return ret; } -static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); @@ -1702,9 +1705,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) return 0; } -void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - io_req_complete_defer(req); + io_req_complete_defer(tw_req.req); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 44b8091c7fcd..f97356ce29d0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -149,9 +149,9 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 5e5b94236d72..7063ea7964e7 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -70,8 +70,9 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); diff --git a/io_uring/notif.c b/io_uring/notif.c index d8ba1165c949..9960bb2a32d5 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,8 +11,9 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) +static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *notif = tw_req.req; struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -34,7 +35,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) } nd = nd->next; - 
io_req_task_complete(notif, tw); + io_req_task_complete((struct io_tw_req){notif}, tw); } while (nd); } diff --git a/io_uring/poll.c b/io_uring/poll.c index c403e751841a..8aa4e3a31e73 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -310,8 +310,9 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; int ret; ret = io_poll_check_events(req, tw); @@ -332,7 +333,7 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, tw); + io_req_task_submit(tw_req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -340,14 +341,14 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } else { io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, tw); + io_req_task_submit(tw_req, tw); else io_req_defer_failed(req, ret); } diff --git a/io_uring/poll.h b/io_uring/poll.h index c8438286dfa0..5647c5138932 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -46,4 +46,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw); diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b2241a5813c..828ac4f902b4 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -564,8 +564,9 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -581,7 +582,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/rw.h b/io_uring/rw.h index 129a53fe5482..9bd7fbf70ea9 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -46,7 +46,7 @@ int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 444142ba9d04..d8fbbaf31cf3 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -68,8 +68,9 @@ static inline bool 
io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; struct io_ring_ctx *ctx = req->ctx; @@ -85,7 +86,7 @@ static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) } } - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -157,8 +158,10 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) +static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *link = tw_req.req; + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; @@ -168,7 +171,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, tw); + io_req_task_complete((struct io_tw_req){link}, tw); link = nxt; } } @@ -317,8 +320,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret; @@ -335,11 +339,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9d67a2a721aa..c09b99e91c86 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -113,8 +113,9 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) +static void io_uring_cmd_work(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index c5e0d979903a..62f7f1f004a5 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -194,8 +194,9 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; @@ -229,7 +230,7 @@ static void 
io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) } io_waitid_complete(req, ret); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, -- cgit v1.2.3 From 20fb3d05a34b55c8ec28ec3d3555e70c5bc0c72d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:30 -0600 Subject: io_uring/uring_cmd: avoid double indirect call in task work dispatch io_uring task work dispatch makes an indirect call to struct io_kiocb's io_task_work.func field to allow running arbitrary task work functions. In the uring_cmd case, this calls io_uring_cmd_work(), which immediately makes another indirect call to struct io_uring_cmd's task_work_cb field. Change the uring_cmd task work callbacks to functions whose signatures match io_req_tw_func_t. Add a function io_uring_cmd_from_tw() to convert from the task work's struct io_tw_req argument to struct io_uring_cmd *. Define a constant IO_URING_CMD_TASK_WORK_ISSUE_FLAGS to avoid manufacturing issue_flags in the uring_cmd task work callbacks. Now uring_cmd task work dispatch makes a single indirect call to the uring_cmd implementation's callback. This also allows removing the task_work_cb field from struct io_uring_cmd, freeing up 8 bytes for future storage. Since fuse_uring_send_in_task() now has access to the io_tw_token_t, check its cancel field directly instead of relying on the IO_URING_F_TASK_DEAD issue flag. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 6 ++++-- drivers/block/ublk_drv.c | 22 +++++++++++----------- drivers/nvme/host/ioctl.c | 7 ++++--- fs/btrfs/ioctl.c | 5 +++-- fs/fuse/dev_uring.c | 7 ++++--- include/linux/io_uring/cmd.h | 22 +++++++++++++--------- include/linux/io_uring_types.h | 1 - io_uring/uring_cmd.c | 18 ++---------------- 8 files changed, 41 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index d7489a56b33c..4ed17c5a4acc 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -769,14 +769,16 @@ struct blk_iou_cmd { bool nowait; }; -static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, issue_flags); + io_uring_cmd_done(cmd, bic->res, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c74a41a6753..e0c601128efa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1302,10 +1302,9 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, return true; } -static void ublk_dispatch_req(struct ublk_queue *ubq, - struct request *req, - unsigned int issue_flags) +static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; @@ -1348,13 +1347,13 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } -static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_cmd_tw_cb(struct io_tw_req tw_req, 
io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - ublk_dispatch_req(ubq, pdu->req, issue_flags); + ublk_dispatch_req(ubq, pdu->req); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) @@ -1366,9 +1365,9 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); } -static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct request *rq = pdu->req_list; struct request *next; @@ -1376,7 +1375,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, do { next = rq->rq_next; rq->rq_next = NULL; - ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); + ublk_dispatch_req(rq->mq_hctx->driver_data, rq); rq = next; } while (rq); } @@ -2523,9 +2522,10 @@ fail_put: return NULL; } -static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); if (ret != -EIOCBQUEUED) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index c212fa952c0f..4fa8400a5627 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -398,14 +398,15 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, - unsigned issue_flags) +static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8cb7d5a462ef..3171d9df0246 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4649,8 +4649,9 @@ struct io_btrfs_cmd { struct btrfs_uring_priv *priv; }; -static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); @@ -4695,7 +4696,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, issue_flags); + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f6b12aebb8bb..f8c93dc45768 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1209,14 +1209,15 @@ static void 
fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, * User buffers are not mapped yet - the application does not have permission * to write to it - this has to be executed in ring task context. */ -static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue = ent->queue; int err; - if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + if (!tw.cancel) { err = fuse_uring_prepare_send(ent, ent->fuse_req); if (err) { fuse_uring_next_fuse_req(ent, queue, issue_flags); diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7509025b4071..375fd048c4cb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,17 +11,13 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) -typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, - unsigned issue_flags); - struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; - /* callback to defer completions to task context */ - io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ + u8 unused[8]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) @@ -60,7 +56,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags); /* @@ -109,7 +105,7 @@ static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, unsigned flags) + io_req_tw_func_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -132,15 +128,23 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif +static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) +{ + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); +} + +/* task_work executor checks the deferred list completion */ +#define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f064a438ce43..92780764d5fa 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -39,7 +39,6 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), - IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index c09b99e91c86..197474911f04 100644 --- a/io_uring/uring_cmd.c +++ 
b/io_uring/uring_cmd.c @@ -113,21 +113,8 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_tw_req tw_req, io_tw_token_t tw) -{ - struct io_kiocb *req = tw_req.req; - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned int flags = IO_URING_F_COMPLETE_DEFER; - - if (unlikely(tw.cancel)) - flags |= IO_URING_F_TASK_DEAD; - - /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, flags); -} - void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -135,8 +122,7 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) return; - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; + req->io_task_work.func = task_work_cb; __io_req_task_work_add(req, flags); } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); -- cgit v1.2.3 From 8627bc8c7d815d929ad59407e13458b564870acf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 17:34:18 +0100 Subject: ns: add missing authorship I authored the files a short while ago. Signed-off-by: Christian Brauner --- include/linux/nstree.h | 1 + kernel/nscommon.c | 1 + kernel/nstree.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 8b8636690473..43aa262c0ea1 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..238402b189f7 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner */ #include #include diff --git a/kernel/nstree.c b/kernel/nstree.c index 369fd1675c6a..4eabab5fceaf 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner */ #include #include -- cgit v1.2.3 From d915fe20e5cba4bd50e41e792a32dcddc7490e25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 16:10:10 +0100 Subject: ns: add NS_COMMON_INIT() Add an initializer that can be used for the ns common initialization for static namespace such as most init namespaces. 
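A statically defined initial namespace can then initialize its embedded ns_common with a single designated initializer; a rough sketch (the structure and reference count shown are illustrative, not taken from an actual conversion):

struct uts_namespace init_uts_ns = {
	.ns = NS_COMMON_INIT(init_uts_ns, 2),	/* refcount value illustrative */
	/* ... remaining uts_namespace fields ... */
};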
Suggested-by: Thomas Gleixner Link: https://patch.msgid.link/87ecqhy2y5.ffs@tglx Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f5b68b8abb54..3a72c3f81eca 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -119,6 +119,16 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: CLONE_NEWUSER, \ struct uts_namespace *: CLONE_NEWUTS) +#define NS_COMMON_INIT(nsname, refs) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = 0, \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(refs), \ +} + #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ -- cgit v1.2.3 From 3dd50c58664e2684bd610a57bf3ab713cbb0ea91 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:21 +0100 Subject: ns: initialize ns_list_node for initial namespaces Make sure that the list is always initialized for initial namespaces. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-8-2e6f823ebdc0@kernel.org Fixes: 885fc8ac0a4d ("nstree: make iterator generic") Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3a72c3f81eca..71a5e28344d1 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -127,6 +127,7 @@ void __ns_common_free(struct ns_common *ns); .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(refs), \ + .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ } #define ns_common_init(__ns) \ -- cgit v1.2.3 From 6b053576edb12c7739ea9c7c9900031361922631 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:22 +0100 Subject: ns: add __ns_ref_read() Implement ns_ref_read() the same way as ns_ref_{get,put}(). No point in making that any more special or different from the other helpers. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-9-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 71a5e28344d1..5e09facafd93 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -154,7 +154,12 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + +#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) #define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) -- cgit v1.2.3 From 4b06b70c8244b442d58ae0fb59870cf31fdb422e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:23 +0100 Subject: ns: rename to exit_nsproxy_namespaces() The current naming is very misleading as this really isn't exiting all of the task's namespaces. It is only exiting the namespaces that hang of off nsproxy. Reflect that in the name. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-10-2e6f823ebdc0@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 2 +- kernel/cgroup/cgroup.c | 6 +++--- kernel/exit.c | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index bd118a187dec..538ba8dba184 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,7 +93,7 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); -void exit_task_namespaces(struct task_struct *tsk); +void exit_nsproxy_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df8bfd26d502..b758a9dd7526 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1519,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. 
*/ return &cgrp_dfl_root.cgrp; } diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..825998103520 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -962,7 +962,7 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..0926bfe4b8df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2453,7 +2453,7 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { mm_clear_owner(p->mm, p); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 19aa64ab08c8..6ce76a0278ab 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -241,7 +241,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } -- cgit v1.2.3 From 3a18f809184bc5a1cfad7cde5b8b026e2ff61587 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:24 +0100 Subject: ns: add active reference count The namespace tree is, among other things, currently used to support file handles for namespaces. When a namespace is created it is placed on the namespace trees and when it is destroyed it is removed from the namespace trees. While a namespace is on the namespace trees with a valid reference count it is possible to reopen it through a namespace file handle. This is all fine but has some issues that should be addressed. On current kernels a namespace is visible to userspace in the following cases: (1) The namespace is in use by a task. (2) The namespace is persisted through a VFS object (namespace file descriptor or bind-mount). Note that (2) only cares about direct persistence of the namespace itself not indirectly via e.g., file->f_cred file references or similar. (3) The namespace is a hierarchical namespace type and is the parent of a single or multiple child namespaces. Case (3) is interesting because it is possible that a parent namespace might not fulfill either (1) or (2), i.e., is invisible to userspace but it may still be resurrected through the NS_GET_PARENT ioctl(). Currently namespace file handles allow much broader access to namespaces than what is currently possible via (1)-(3). The reason is that namespaces may remain pinned for completely internal reasons yet are inaccessible to userspace. For example, a user namespace may remain pinned by get_cred() calls to stash the opener's credentials into file->f_cred. As it stands, file handles allow resurrecting such a user namespace even though this should not be possible via (1)-(3). This is a fundamental uapi change that we shouldn't do if we don't have to. Consider the following insane case: Various architectures support the CONFIG_MMU_LAZY_TLB_REFCOUNT option which uses lazy TLB destruction. When this option is set a userspace task's struct mm_struct may be used for kernel threads such as the idle task and will only be destroyed once the cpu's runqueue switches back to another task. But because of ptrace() permission checks struct mm_struct stashes the user namespace of the task that struct mm_struct originally belonged to. The kernel thread will take a reference on the struct mm_struct and thus pin it. 
So on an idle system user namespaces can be persisted for arbitrary amounts of time which also means that they can be resurrected using namespace file handles. That makes no sense whatsoever. The problem is of course exacerbated on large systems with a huge number of cpus. To handle this nicely we introduce an active reference count which tracks (1)-(3). This is easy to do as all of these things are already managed centrally. Only (1)-(3) will count towards the active reference count and only namespaces which are active may be opened via namespace file handles. The problem is that namespaces may be resurrected, which means that they can become temporarily inactive and will be reactivated some time later. Currently the only example of this is the SIOCGSKNS socket ioctl. The SIOCGSKNS ioctl allows opening a network namespace file descriptor based on a socket file descriptor. If a socket is tied to a network namespace that subsequently becomes inactive but that socket is persisted by another process in another network namespace (e.g., via SCM_RIGHTS or pidfd_getfd()) then the SIOCGSKNS ioctl will resurrect this network namespace. So calls to open_related_ns() and open_namespace() will end up resurrecting the corresponding namespace tree. Note that the active reference count does not regulate the lifetime of the namespace itself. This is still done by the normal reference count. The active reference count can only be elevated if the regular reference count is elevated. The active reference count also doesn't regulate the presence of a namespace on the namespace trees. It only regulates its visibility to namespace file handles (and in later patches to listns()). A namespace remains on the namespace trees from creation until its actual destruction. This will allow the kernel to always reach any namespace trivially and it will also enable subsystems like bpf to walk the namespace lists on the system for tracing or general introspection purposes. Note that different namespaces have different visibility lifetimes on current kernels. While most namespaces are immediately released when the last task using them exits, the user- and pid namespace are persisted and thus both remain accessible via /proc//ns/. The user namespace lifetime is aligned with struct cred and is only released through exit_creds(). However, it becomes inaccessible to userspace once the last task using it is reaped, i.e., when release_task() is called and all proc entries are flushed. Similarly, the pid namespace is also visible until the last task using it has been reaped and the associated pid numbers are freed. The active reference counts of the user- and pid namespace are decremented once the task is reaped. 
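The consumer-side pattern this enables can be sketched as follows (simplified from the nsfs file handle changes below; error handling and locking elided):

/* Only hand out a new reference if the namespace is still active. */
static struct ns_common *example_fh_lookup(struct ns_common *ns)
{
	/*
	 * Racy by design: if the namespace goes inactive right after
	 * this check, a later nsfs_init_inode() resurrects the tree.
	 */
	if (!ns_get_unless_inactive(ns))
		return NULL;
	return ns;
}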
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-11-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- fs/nsfs.c | 48 ++++++++++- include/linux/ns_common.h | 141 +++++++++++++++++++++++++++++- include/linux/nsfs.h | 3 + include/linux/nsproxy.h | 3 + kernel/cred.c | 6 ++ kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/nscommon.c | 214 +++++++++++++++++++++++++++++++++++++++++++++- kernel/nsproxy.c | 23 +++++ kernel/nstree.c | 8 ++ kernel/pid.c | 5 ++ 11 files changed, 449 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 8b53fd361177..0c35e4e54711 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = { static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; + + __ns_ref_active_put(ns); clear_inode(inode); ns->ops->put(ns); } @@ -419,6 +421,16 @@ static int nsfs_init_inode(struct inode *inode, void *data) inode->i_mode |= S_IRUGO; inode->i_fop = &ns_file_operations; inode->i_ino = ns->inum; + + /* + * Bring the namespace subtree back to life if we have to. This + * can happen when e.g., all processes using a network namespace + * and all namespace files or namespace file bind-mounts have + * died but there are still sockets pinning it. The SIOCGSKNS + * ioctl on such a socket will resurrect the relevant namespace + * subtree. + */ + __ns_ref_active_resurrect(ns); return 0; } @@ -495,7 +507,17 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, if (ns->inum != fid->ns_inum) return NULL; - if (!__ns_ref_get(ns)) + /* + * This is racy because we're not actually taking an + * active reference. IOW, it could happen that the + * namespace becomes inactive after this check. + * We don't care because nsfs_init_inode() will just + * resurrect the relevant namespace tree for us. If it + * has been active here we just allow it's resurrection. + * We could try to take an active reference here and + * then drop it again. But really, why bother. 
+ */ + if (!ns_get_unless_inactive(ns)) return NULL; } @@ -615,3 +637,27 @@ void __init nsfs_init(void) nsfs_root_path.mnt = nsfs_mnt; nsfs_root_path.dentry = nsfs_mnt->mnt_root; } + +void nsproxy_ns_active_get(struct nsproxy *ns) +{ + ns_ref_active_get(ns->mnt_ns); + ns_ref_active_get(ns->uts_ns); + ns_ref_active_get(ns->ipc_ns); + ns_ref_active_get(ns->pid_ns_for_children); + ns_ref_active_get(ns->cgroup_ns); + ns_ref_active_get(ns->net_ns); + ns_ref_active_get(ns->time_ns); + ns_ref_active_get(ns->time_ns_for_children); +} + +void nsproxy_ns_active_put(struct nsproxy *ns) +{ + ns_ref_active_put(ns->mnt_ns); + ns_ref_active_put(ns->uts_ns); + ns_ref_active_put(ns->ipc_ns); + ns_ref_active_put(ns->pid_ns_for_children); + ns_ref_active_put(ns->cgroup_ns); + ns_ref_active_put(ns->net_ns); + ns_ref_active_put(ns->time_ns); + ns_ref_active_put(ns->time_ns_for_children); +} diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5e09facafd93..bdd0df15ad9c 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include struct proc_ns_operations; @@ -37,6 +39,67 @@ extern const struct proc_ns_operations cgroupns_operations; extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc//ns/, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. 
+ * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + */ struct ns_common { u32 ns_type; struct dentry *stashed; @@ -48,6 +111,7 @@ struct ns_common { u64 ns_id; struct rb_node ns_tree_node; struct list_head ns_list_node; + atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; }; @@ -56,6 +120,13 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +static __always_inline bool is_initial_namespace(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->inum == 0); + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -127,6 +198,7 @@ void __ns_common_free(struct ns_common *ns); .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref_active = ATOMIC_INIT(1), \ .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ } @@ -144,14 +216,26 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) +{ + return atomic_read(&ns->__ns_ref_active); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->__ns_ref); + if (refcount_dec_and_test(&ns->__ns_ref)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return true; + } + return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->__ns_ref); + if (refcount_inc_not_zero(&ns->__ns_ref)) + return true; + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return false; } static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) @@ -166,4 +250,57 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_put_and_lock(__ns, __lock) \ refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +#define ns_ref_active_read(__ns) \ + ((__ns) ? 
__ns_ref_active_read(to_ns_common(__ns)) : 0) + +void __ns_ref_active_get_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_get(struct ns_common *ns) +{ + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); + VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); +} +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) + +static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) +{ + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + return true; + } + return false; +} + +#define ns_ref_active_get_owner(__ns) \ + do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) + +void __ns_ref_active_put_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_put(struct ns_common *ns) +{ + if (atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(is_initial_namespace(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + __ns_ref_active_put_owner(ns); + } +} +#define ns_ref_active_put(__ns) \ + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) + +static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); + if (!__ns_ref_active_read(ns)) + return NULL; + if (!__ns_ref_get(ns)) + return NULL; + return ns; +} + +void __ns_ref_active_resurrect(struct ns_common *ns); + +#define ns_ref_active_resurrect(__ns) \ + do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) + #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index e5a5fa83d36b..731b67fc2fec 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -37,4 +37,7 @@ void nsfs_init(void); #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +void nsproxy_ns_active_get(struct nsproxy *ns); +void nsproxy_ns_active_put(struct nsproxy *ns); + #endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 538ba8dba184..ac825eddec59 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,7 +93,10 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); +void switch_cred_namespaces(const struct cred *old, const struct cred *new); void exit_nsproxy_namespaces(struct task_struct *tsk); +void get_cred_namespaces(struct task_struct *tsk); +void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6e7f580df14 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -306,6 +306,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +344,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +438,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); 
rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/exit.c b/kernel/exit.c index 825998103520..988e16efd66b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -291,6 +291,7 @@ repeat: write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); diff --git a/kernel/fork.c b/kernel/fork.c index 0926bfe4b8df..f1857672426e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 238402b189f7..abd1ac1a2d02 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -3,6 +3,7 @@ #include #include +#include #include #ifdef CONFIG_DEBUG_VFS @@ -53,6 +54,8 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; @@ -69,10 +72,219 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns->inum = inum; return 0; } - return proc_alloc_inum(&ns->inum); + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_initial_namespace(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +static struct ns_common *ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +void __ns_ref_active_get_owner(struct ns_common *ns) +{ + ns = ns_owner(ns); + if (ns) + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. 
+ * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put_owner(struct ns_common *ns) +{ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + if (!atomic_dec_and_test(&ns->__ns_ref_active)) + return; + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_resurrect(struct ns_common *ns) +{ + /* If we didn't resurrect the namespace we're done. */ + if (atomic_fetch_add(1, &ns->__ns_ref_active)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. 
+ */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + if (atomic_fetch_add(1, &ns->__ns_ref_active)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ce76a0278ab..94c2cfe0afa1 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -179,12 +180,15 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } void free_nsproxy(struct nsproxy *ns) { + nsproxy_ns_active_put(ns); + put_mnt_ns(ns->mnt_ns); put_uts_ns(ns->uts_ns); put_ipc_ns(ns->ipc_ns); @@ -232,6 +236,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -246,6 +253,22 @@ void exit_nsproxy_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; diff --git a/kernel/nstree.c b/kernel/nstree.c index 4eabab5fceaf..e2a537785128 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -123,6 +123,14 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree->ns_tree_lock); VFS_WARN_ON_ONCE(node); + + /* + * Take an active reference on the owner namespace. This ensures + * that the owner remains visible while any of its child namespaces + * are active. For init namespaces this is a no-op as ns_owner() + * returns NULL for namespaces owned by init_user_ns. + */ + __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) diff --git a/kernel/pid.c b/kernel/pid.c index 19d4599c136c..a5a63dc0a491 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -112,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -278,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; -- cgit v1.2.3 From 8895d2a3dbf49f23622ab8da9fb3909826edd6dc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:25 +0100 Subject: ns: use anonymous struct to group list member Make it easier to spot that they belong together conceptually. 
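For readers unfamiliar with the idiom, a minimal sketch follows (illustrative only; the names are made up and this is not the actual ns_common layout). Members wrapped in an unnamed struct remain accessible exactly as before, the grouping merely documents that they belong together:

	struct example {
		u64 id;
		struct /* tree membership */ {
			struct rb_node tree_node;
			struct list_head list_node;
		};
	};

Code keeps referring to example.tree_node and example.list_node directly; only the declaration changes.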
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-12-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bdd0df15ad9c..32463203c824 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -109,8 +109,10 @@ struct ns_common { union { struct { u64 ns_id; - struct rb_node ns_tree_node; - struct list_head ns_list_node; + struct /* per type rbtree and list */ { + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; -- cgit v1.2.3 From 2ccaebc686e9ef7e94b3a8d89706daed6e696667 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:26 +0100 Subject: nstree: introduce a unified tree This will allow userspace to lookup and stat a namespace simply by its identifier without having to know what type of namespace it is. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-13-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 ++ kernel/nscommon.c | 1 + kernel/nstree.c | 96 +++++++++++++++++++++++++++++++++++------------ 3 files changed, 76 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 32463203c824..7a3c71b3a76f 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -109,6 +109,9 @@ struct ns_common { union { struct { u64 ns_id; + struct /* global namespace rbtree and list */ { + struct rb_node ns_unified_tree_node; + }; struct /* per type rbtree and list */ { struct rb_node ns_tree_node; struct list_head ns_list_node; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index abd1ac1a2d02..17a6ea44f054 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -62,6 +62,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns->ns_id = 0; ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); + RB_CLEAR_NODE(&ns->ns_unified_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); #ifdef CONFIG_DEBUG_VFS diff --git a/kernel/nstree.c b/kernel/nstree.c index e2a537785128..bbb34b46b01b 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -5,31 +5,30 @@ #include #include +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); +static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */ + /** * struct ns_tree - Namespace tree * @ns_tree: Rbtree of namespaces of a particular type * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list * @type: type of namespaces in this tree */ struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; + struct rb_root ns_tree; + struct list_head ns_list; + int type; }; struct ns_tree mnt_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), .type = CLONE_NEWNS, }; struct ns_tree net_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), .type = CLONE_NEWNET, }; EXPORT_SYMBOL_GPL(net_ns_tree); @@ -37,42 +36,36 @@ EXPORT_SYMBOL_GPL(net_ns_tree); struct ns_tree 
uts_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), .type = CLONE_NEWUTS, }; struct ns_tree user_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), .type = CLONE_NEWUSER, }; struct ns_tree ipc_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), .type = CLONE_NEWIPC, }; struct ns_tree pid_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), .type = CLONE_NEWPID, }; struct ns_tree cgroup_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), .type = CLONE_NEWCGROUP, }; struct ns_tree time_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), .type = CLONE_NEWTIME, }; @@ -85,6 +78,13 @@ static inline struct ns_common *node_to_ns(const struct rb_node *node) return rb_entry(node, struct ns_common, ns_tree_node); } +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_tree_node); +} + static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) { struct ns_common *ns_a = node_to_ns(a); @@ -99,15 +99,27 @@ static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) return 0; } +static inline int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns_unified(a); + struct ns_common *ns_b = node_to_ns_unified(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) { struct rb_node *node, *prev; VFS_WARN_ON_ONCE(!ns->ns_id); - write_seqlock(&ns_tree->ns_tree_lock); - - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + write_seqlock(&ns_tree_lock); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -120,7 +132,8 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) else list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); - write_sequnlock(&ns_tree->ns_tree_lock); + rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified); + write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); @@ -139,11 +152,12 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); - write_seqlock(&ns_tree->ns_tree_lock); + write_seqlock(&ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree); list_bidir_del_rcu(&ns->ns_list_node); RB_CLEAR_NODE(&ns->ns_tree_node); - write_sequnlock(&ns_tree->ns_tree_lock); + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); @@ -159,6 +173,17 @@ static int ns_find(const void *key, const struct rb_node *node) return 0; } +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > 
ns->ns_id) + return 1; + return 0; +} static struct ns_tree *ns_tree_from_type(int ns_type) { @@ -184,28 +209,51 @@ static struct ns_tree *ns_tree_from_type(int ns_type) return NULL; } -struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) { - struct ns_tree *ns_tree; struct rb_node *node; unsigned int seq; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_tree, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; ns_tree = ns_tree_from_type(ns_type); if (!ns_tree) return NULL; do { - seq = read_seqbegin(&ns_tree->ns_tree_lock); + seq = read_seqbegin(&ns_tree_lock); node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); if (node) break; - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + } while (read_seqretry(&ns_tree_lock, seq)); return node_to_ns(node); } +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); +} + /** * ns_tree_adjoined_rcu - find the next/previous namespace in the same * tree -- cgit v1.2.3 From 3760342fd6312416491d536144e39297fa5b1950 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:28 +0100 Subject: nstree: assign fixed ids to the initial namespaces The initial set of namespace comes with fixed inode numbers making it easy for userspace to identify them solely based on that information. This has long preceeded anything here. Similarly, let's assign fixed namespace ids for the initial namespaces. Kill the cookie and use a sequentially increasing number. This has the nice side-effect that the owning user namespace will always have a namespace id that is smaller than any of it's descendant namespaces. 
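As an illustration of what the fixed ids buy consumers (a sketch under the assumptions of this patch, not code added by it): ids 1 through MNT_NS_INIT_ID are reserved for the initial namespaces and everything else gets a larger, monotonically increasing id, so a boot-time namespace can be recognized from its id alone:

	/*
	 * Illustrative helper only. The *_NS_INIT_ID constants come from the
	 * uapi header below; NS_LAST_INIT_ID is kernel-internal, so a
	 * userspace variant compares against MNT_NS_INIT_ID, the highest
	 * reserved value.
	 */
	static inline bool ns_id_is_initial(__u64 ns_id)
	{
		return ns_id >= IPC_NS_INIT_ID && ns_id <= MNT_NS_INIT_ID;
	}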
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-15-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 13 ++++++++++++- include/linux/nstree.h | 15 +++++++++++---- include/uapi/linux/nsfs.h | 14 ++++++++++++++ kernel/nstree.c | 13 ++++++++----- net/core/net_namespace.c | 2 +- 6 files changed, 47 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 7b78dd48b6c3..eded33eeb647 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4094,7 +4094,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a return ERR_PTR(ret); } if (!anon) - ns_tree_gen_id(&new_ns->ns); + ns_tree_gen_id(new_ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7a3c71b3a76f..009a6dea724f 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -173,6 +173,17 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + #define to_ns_operations(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ @@ -198,7 +209,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) #define NS_COMMON_INIT(nsname, refs) \ { \ .ns_type = ns_common_type(&nsname), \ - .ns_id = 0, \ + .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 43aa262c0ea1..38674c6fa4f7 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,6 +9,7 @@ #include #include #include +#include extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; @@ -30,7 +31,11 @@ extern struct ns_tree uts_ns_tree; struct user_namespace *: &(user_ns_tree), \ struct uts_namespace *: &(uts_ns_tree)) -u64 ns_tree_gen_id(struct ns_common *ns); +#define ns_tree_gen_id(__ns) \ + __ns_tree_gen_id(to_ns_common(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) + +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); @@ -38,9 +43,9 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, struct ns_tree *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id) { - ns_tree_gen_id(ns); + __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); } @@ -60,7 +65,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. 
*/ -#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) +#define ns_tree_add(__ns) \ + __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) /** * ns_tree_remove - Remove a namespace from a namespace tree diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index e098759ec917..f8bc2aad74d6 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -67,4 +67,18 @@ struct nsfs_file_handle { #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + #endif /* __LINUX_NSFS_H */ diff --git a/kernel/nstree.c b/kernel/nstree.c index bbb34b46b01b..cf102c5bb849 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -69,8 +69,6 @@ struct ns_tree time_ns_tree = { .type = CLONE_NEWTIME, }; -DEFINE_COOKIE(namespace_cookie); - static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) @@ -285,15 +283,20 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, /** * ns_tree_gen_id - generate a new namespace id * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id * * Generates a new namespace id and assigns it to the namespace. All * namespaces types share the same id space and thus can be compared * directly. IOW, when two ids of two namespace are equal, they are * identical. */ -u64 ns_tree_gen_id(struct ns_common *ns) +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) { - guard(preempt)(); - ns->ns_id = gen_cookie_next(&namespace_cookie); + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0e0f22d7b21..83cbec4afcb3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -439,7 +439,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = ns_tree_gen_id(&net->ns); + net->net_cookie = ns_tree_gen_id(net); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); -- cgit v1.2.3 From 3c1a52f2a6c865464babe7a85c2796aa31cc9744 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:29 +0100 Subject: nstree: maintain list of owned namespaces The namespace tree doesn't express the ownership concept of namespace appropriately. Maintain a list of directly owned namespaces per user namespace. This will allow userspace and the kernel to use the listns() system call to walk the namespace tree by owning user namespace. The rbtree is used to find the relevant namespace entry point which allows to continue iteration and the owner list can be used to walk the tree completely lock free. 
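The intended lookup pattern, sketched for illustration (the actual consumer is listns() added later in the series): the rbtree yields the first namespace at or above a given id, and iteration then continues along the per-owner list under RCU. The helper below is not part of this patch and only assumes the ns_owner/ns_owner_entry members introduced here:

	static void walk_owned_namespaces(struct user_namespace *user_ns)
	{
		struct ns_common *owner = to_ns_common(user_ns);
		struct ns_common *child;

		rcu_read_lock();
		list_for_each_entry_rcu(child, &owner->ns_owner, ns_owner_entry)
			pr_debug("owned ns id %llu, type %u\n",
				 (unsigned long long)child->ns_id, child->ns_type);
		rcu_read_unlock();
	}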
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-16-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 8 ++++++ kernel/nscommon.c | 4 +++ kernel/nstree.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 009a6dea724f..698aa2f7f486 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -116,6 +116,12 @@ struct ns_common { struct rb_node ns_tree_node; struct list_head ns_list_node; }; + struct /* namespace ownership rbtree and list */ { + struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ + struct list_head ns_owner; /* list of namespaces owned by this namespace */ + struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ + struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ + }; atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; @@ -216,6 +222,8 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) .__ns_ref = REFCOUNT_INIT(refs), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ + .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ + .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \ } #define ns_common_init(__ns) \ diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 17a6ea44f054..f0b7971392d2 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -63,7 +63,11 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); RB_CLEAR_NODE(&ns->ns_unified_tree_node); + RB_CLEAR_NODE(&ns->ns_owner_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); + ns->ns_owner_tree = RB_ROOT; + INIT_LIST_HEAD(&ns->ns_owner); + INIT_LIST_HEAD(&ns->ns_owner_entry); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); diff --git a/kernel/nstree.c b/kernel/nstree.c index cf102c5bb849..1f54f914e30c 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -3,7 +3,9 @@ #include #include +#include #include +#include static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */ @@ -83,6 +85,13 @@ static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) return rb_entry(node, struct ns_common, ns_unified_tree_node); } +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_tree_node); +} + static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) { struct ns_common *ns_a = node_to_ns(a); @@ -111,11 +120,27 @@ static inline int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) return 0; } +static inline int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns_owner(a); + struct ns_common *ns_b = node_to_ns_owner(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) { struct rb_node *node, *prev; + const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree_lock); @@ -131,6 +156,30 @@ void __ns_tree_add_raw(struct 
ns_common *ns, struct ns_tree *ns_tree) list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified); + + if (ops) { + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's rbtree */ + rb_find_add_rcu(&ns->ns_owner_tree_node, &owner->ns_owner_tree, ns_cmp_owner); + + /* Insert into owner's list in sorted order */ + prev = rb_prev(&ns->ns_owner_tree_node); + if (!prev) + list_add_rcu(&ns->ns_owner_entry, &owner->ns_owner); + else + list_add_rcu(&ns->ns_owner_entry, &node_to_ns_owner(prev)->ns_owner_entry); + } else { + /* Only the initial user namespace doesn't have an owner. */ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); @@ -146,6 +195,9 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); @@ -153,8 +205,22 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree); - list_bidir_del_rcu(&ns->ns_list_node); RB_CLEAR_NODE(&ns->ns_tree_node); + + list_bidir_del_rcu(&ns->ns_list_node); + + /* Remove from owner's rbtree if this namespace has an owner */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + rb_erase(&ns->ns_owner_tree_node, &owner->ns_owner_tree); + RB_CLEAR_NODE(&ns->ns_owner_tree_node); + } + + list_bidir_del_rcu(&ns->ns_owner_entry); + } + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); -- cgit v1.2.3 From 560e25e70fa40ec69f97f14207bde9bc18bec9b8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:31 +0100 Subject: nstree: add unified namespace list Allow to walk the unified namespace list completely locklessly. 
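For illustration, a minimal sketch of such a lockless walk (not part of this patch; ns_unified_list is file-local to kernel/nstree.c below, so code like this would live there). Readers traverse the list under RCU while writers stay serialized by ns_tree_lock:

	static void walk_all_namespaces(void)
	{
		struct ns_common *ns;

		rcu_read_lock();
		list_for_each_entry_rcu(ns, &ns_unified_list, ns_unified_list_node)
			pr_debug("ns id %llu, type %u\n",
				 (unsigned long long)ns->ns_id, ns->ns_type);
		rcu_read_unlock();
	}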
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-18-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ kernel/nscommon.c | 1 + kernel/nstree.c | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 698aa2f7f486..3f05dd7d40c7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -111,6 +111,7 @@ struct ns_common { u64 ns_id; struct /* global namespace rbtree and list */ { struct rb_node ns_unified_tree_node; + struct list_head ns_unified_list_node; }; struct /* per type rbtree and list */ { struct rb_node ns_tree_node; @@ -224,6 +225,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \ + .ns_unified_list_node = LIST_HEAD_INIT(nsname.ns.ns_unified_list_node), \ } #define ns_common_init(__ns) \ diff --git a/kernel/nscommon.c b/kernel/nscommon.c index f0b7971392d2..4cbe1ecc8df0 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -65,6 +65,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope RB_CLEAR_NODE(&ns->ns_unified_tree_node); RB_CLEAR_NODE(&ns->ns_owner_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); + INIT_LIST_HEAD(&ns->ns_unified_list_node); ns->ns_owner_tree = RB_ROOT; INIT_LIST_HEAD(&ns->ns_owner); INIT_LIST_HEAD(&ns->ns_owner_entry); diff --git a/kernel/nstree.c b/kernel/nstree.c index 419d500d09df..dcad6a308547 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -9,6 +9,7 @@ static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */ +static LIST_HEAD(ns_unified_list); /* protected by ns_tree_lock */ /** * struct ns_tree - Namespace tree @@ -137,7 +138,13 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) else list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to unified tree and list */ rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified); + prev = rb_prev(&ns->ns_unified_tree_node); + if (!prev) + list_add_rcu(&ns->ns_unified_list_node, &ns_unified_list); + else + list_add_rcu(&ns->ns_unified_list_node, &node_to_ns_unified(prev)->ns_unified_list_node); if (ops) { struct user_namespace *user_ns; @@ -186,11 +193,15 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree); RB_CLEAR_NODE(&ns->ns_tree_node); list_bidir_del_rcu(&ns->ns_list_node); + rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree); + RB_CLEAR_NODE(&ns->ns_unified_tree_node); + + list_bidir_del_rcu(&ns->ns_unified_list_node); + /* Remove from owner's rbtree if this namespace has an owner */ if (ops) { user_ns = ops->owner(ns); -- cgit v1.2.3 From 76b6f5dfb3fda76fce1f9990d6fa58adc711122b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:32 +0100 Subject: nstree: add listns() Add a new listns() system call that allows userspace to iterate through namespaces in the system. This provides a programmatic interface to discover and inspect namespaces, enhancing existing namespace apis. Currently, there is no direct way for userspace to enumerate namespaces in the system. 
Applications must resort to scanning /proc//ns/ across all processes, which is: 1. Inefficient - requires iterating over all processes 2. Incomplete - misses inactive namespaces that aren't attached to any running process but are kept alive by file descriptors, bind mounts, or parent namespace references 3. Permission-heavy - requires access to /proc for many processes 4. No ordering or ownership. 5. No filtering per namespace type: Must always iterate and check all namespaces. The list goes on. The listns() system call solves these problems by providing direct kernel-level enumeration of namespaces. It is similar to listmount() but obviously tailored to namespaces. /* * @req: Pointer to struct ns_id_req specifying search parameters * @ns_ids: User buffer to receive namespace IDs * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return) * @flags: Reserved for future use (must be 0) */ ssize_t listns(const struct ns_id_req *req, u64 *ns_ids, size_t nr_ns_ids, unsigned int flags); Returns: - On success: Number of namespace IDs written to ns_ids - On error: Negative error code /* * @size: Structure size * @ns_id: Starting point for iteration; use 0 for first call, then * use the last returned ID for subsequent calls to paginate * @ns_type: Bitmask of namespace types to include (from enum ns_type): * 0: Return all namespace types * MNT_NS: Mount namespaces * NET_NS: Network namespaces * USER_NS: User namespaces * etc. Can be OR'd together * @user_ns_id: Filter results to namespaces owned by this user namespace: * 0: Return all namespaces (subject to permission checks) * LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace * Other value: Namespaces owned by the specified user namespace ID */ struct ns_id_req { __u32 size; /* sizeof(struct ns_id_req) */ __u32 spare; /* Reserved, must be 0 */ __u64 ns_id; /* Last seen namespace ID (for pagination) */ __u32 ns_type; /* Filter by namespace type(s) */ __u32 spare2; /* Reserved, must be 0 */ __u64 user_ns_id; /* Filter by owning user namespace */ }; Example 1: List all namespaces void list_all_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, /* Start from beginning */ .ns_type = 0, /* All types */ .user_ns_id = 0, /* All user namespaces */ }; uint64_t ids[100]; ssize_t ret; printf("All namespaces in the system:\n"); do { ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); break; } for (ssize_t i = 0; i < ret; i++) printf(" Namespace ID: %llu\n", (unsigned long long)ids[i]); /* Continue from last seen ID */ if (ret > 0) req.ns_id = ids[ret - 1]; } while (ret == 100); /* Buffer was full, more may exist */ } Example 2: List network namespaces only void list_network_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = NET_NS, /* Only network namespaces */ .user_ns_id = 0, }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); return; } printf("Network namespaces: %zd found\n", ret); for (ssize_t i = 0; i < ret; i++) printf(" netns ID: %llu\n", (unsigned long long)ids[i]); } Example 3: List namespaces owned by current user namespace void list_owned_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = 0, /* All types */ .user_ns_id = LISTNS_CURRENT_USER, /* Current userns */ }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); return; } printf("Namespaces owned by my user namespace: %zd\n", ret); for (ssize_t i = 0; i < ret; i++) 
printf(" ns ID: %llu\n", (unsigned long long)ids[i]); } Example 4: List multiple namespace types void list_network_and_mount_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = NET_NS | MNT_NS, /* Network and mount */ .user_ns_id = 0, }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); printf("Network and mount namespaces: %zd found\n", ret); } Example 5: Pagination through large namespace sets void list_all_with_pagination(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = 0, .user_ns_id = 0, }; uint64_t ids[50]; size_t total = 0; ssize_t ret; printf("Enumerating all namespaces with pagination:\n"); while (1) { ret = listns(&req, ids, 50, 0); if (ret < 0) { perror("listns"); break; } if (ret == 0) break; /* No more namespaces */ total += ret; printf(" Batch: %zd namespaces\n", ret); /* Last ID in this batch becomes start of next batch */ req.ns_id = ids[ret - 1]; if (ret < 50) break; /* Partial batch = end of results */ } printf("Total: %zu namespaces\n", total); } Permission Model listns() respects namespace isolation and capabilities: (1) Global listing (user_ns_id = 0): - Requires CAP_SYS_ADMIN in the namespace's owning user namespace - OR the namespace must be in the caller's namespace context (e.g., a namespace the caller is currently using) - User namespaces additionally allow listing if the caller has CAP_SYS_ADMIN in that user namespace itself (2) Owner-filtered listing (user_ns_id != 0): - Requires CAP_SYS_ADMIN in the specified owner user namespace - OR the namespace must be in the caller's namespace context - This allows unprivileged processes to enumerate namespaces they own (3) Visibility: - Only "active" namespaces are listed - A namespace is active if it has a non-zero __ns_ref_active count - This includes namespaces used by running processes, held by open file descriptors, or kept active by bind mounts - Inactive namespaces (kept alive only by internal kernel references) are not visible via listns() Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- fs/nsfs.c | 39 ++++ include/linux/ns_common.h | 2 + include/linux/syscalls.h | 4 + include/linux/user_namespace.h | 4 +- include/uapi/linux/nsfs.h | 44 +++++ kernel/nscommon.c | 2 +- kernel/nstree.c | 397 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 489 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 4a95a0a38f86..ba6c8975c82e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -471,6 +471,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return FILEID_NSFS; } +bool is_current_namespace(struct ns_common *ns) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + return current_in_namespace(to_cg_ns(ns)); +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + return current_in_namespace(to_ipc_ns(ns)); +#endif + case CLONE_NEWNS: + return current_in_namespace(to_mnt_ns(ns)); +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + return current_in_namespace(to_net_ns(ns)); +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + return current_in_namespace(to_pid_ns(ns)); +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + return current_in_namespace(to_time_ns(ns)); +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + return current_in_namespace(to_user_ns(ns)); +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + return current_in_namespace(to_uts_ns(ns)); +#endif + default: + 
VFS_WARN_ON_ONCE(true); + return false; + } +} + static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3f05dd7d40c7..bd4492ef6ffc 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -129,8 +129,10 @@ struct ns_common { }; }; +bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +struct ns_common *__must_check ns_owner(struct ns_common *ns); static __always_inline bool is_initial_namespace(struct ns_common *ns) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 66c06fcdfe19..cf84d98964b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct ns_id_req; struct xattr_args; struct file_attr; @@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req, asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); +asmlinkage long sys_listns(const struct ns_id_req __user *req, + u64 __user *ns_ids, size_t nr_ns_ids, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9a9aebbf96b9..9c3be157397e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } -#ifdef CONFIG_USER_NS - static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } +#ifdef CONFIG_USER_NS + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index f8bc2aad74d6..a25e38d1c874 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -81,4 +81,48 @@ enum init_ns_id { #endif }; +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. 
*/ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 4cbe1ecc8df0..6fe1c747fa46 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -98,7 +98,7 @@ void __ns_common_free(struct ns_common *ns) proc_free_inum(ns->inum); } -static struct ns_common *ns_owner(struct ns_common *ns) +struct ns_common *__must_check ns_owner(struct ns_common *ns) { struct user_namespace *owner; diff --git a/kernel/nstree.c b/kernel/nstree.c index dcad6a308547..4a8838683b6b 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -5,6 +5,7 @@ #include #include #include +#include #include static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); @@ -359,3 +360,399 @@ u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. 
+ */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + read_seqlock_excl(&ns_tree_lock); + node = owner->ns_owner_tree.rb_node; + + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + read_sequnlock_excl(&ns_tree_lock); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static void __ns_put(struct ns_common *ns) +{ + if (ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. 
+ */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); + for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; + ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { + struct ns_common *valid __free(ns_put); + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + if (put_user(valid->ns_id, ns_ids + ret)) + return -EINVAL; + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + read_seqlock_excl(&ns_tree_lock); + if (ns_tree) + node = ns_tree->ns_tree.rb_node; + else + node = ns_unified_tree.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + read_sequnlock_excl(&ns_tree_lock); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_list_node); + return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node); + return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree *ns_tree) +{ + if (ns_tree) + return &ns->ns_list_node == head; + return &ns->ns_unified_list_node == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL; + struct ns_tree *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list; + else + head = &ns_unified_list; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid 
__free(ns_put); + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + if (put_user(valid->ns_id, ns_ids + ret)) + return -EINVAL; + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} -- cgit v1.2.3 From 16dad7801aad73138a2dff5ea950130646914d1f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:15 -1000 Subject: cgroup: Rename cgroup lifecycle hooks to cgroup_task_*() The current names cgroup_exit(), cgroup_release(), and cgroup_free() are confusing because they look like they're operating on cgroups themselves when they're actually task lifecycle hooks. For example, cgroup_init() initializes the cgroup subsystem while cgroup_exit() is a task exit notification to cgroup. Rename them to cgroup_task_exit(), cgroup_task_release(), and cgroup_task_free() to make it clear that these operate on tasks. Cc: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 12 ++++++------ kernel/cgroup/cgroup.c | 11 ++++++----- kernel/exit.c | 4 ++-- kernel/fork.c | 2 +- kernel/sched/autogroup.c | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b16..4068035176c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,9 +137,9 @@ extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); -void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); -void cgroup_free(struct task_struct *p); +void cgroup_task_exit(struct task_struct *p); +void cgroup_task_release(struct task_struct *p); +void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -680,9 +680,9 @@ static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} -static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} -static inline void cgroup_free(struct task_struct *p) {} +static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_release(struct task_struct *p) {} +static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e..826b7fd2f85d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -944,7 +944,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * 
against PF_EXITING setting such that we can't race - * against cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_exit()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -6972,13 +6973,13 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -7010,7 +7011,7 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_release(struct task_struct *task) +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -7027,7 +7028,7 @@ void cgroup_release(struct task_struct *task) } } -void cgroup_free(struct task_struct *task) +void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..46173461e8de 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ repeat: rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); @@ -967,7 +967,7 @@ void __noreturn do_exit(long code) exit_thread(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..960c39c9c264 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -738,7 +738,7 @@ void __put_task_struct(struct task_struct *tsk) unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). -- cgit v1.2.3 From d245698d727ab8f5420b3e28d1243f96a5234851 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:17 -1000 Subject: cgroup: Defer task cgroup unlink until after the task is done switching out When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task from its cgroup. From the cgroup's perspective, the task is now gone. If this makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks that notify controllers the cgroup is going offline resource-wise. However, the exiting task can still run, perform memory operations, and schedule until the final context switch in finish_task_switch(). 
This creates a confusing situation where controllers are told a cgroup is offline while resource activities are still happening in it. While this hasn't broken existing controllers, it has caused direct confusion for sched_ext schedulers. Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls the subsystem exit callbacks and continues to be called from do_exit(). The css_set cleanup is moved to the new cgroup_task_dead() which is called from finish_task_switch() after the final context switch, so that the cgroup only appears empty after the task is truly done running. This also reorders operations so that subsys->exit() is now called before unlinking from the cgroup, which shouldn't break anything. Cc: Dan Schatzberg Cc: Peter Zijlstra Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup/cgroup.c | 23 ++++++++++++++--------- kernel/sched/core.c | 2 ++ 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4068035176c4..bc892e3b37ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p, extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_task_exit(struct task_struct *p); +void cgroup_task_dead(struct task_struct *p); void cgroup_task_release(struct task_struct *p); void cgroup_task_free(struct task_struct *p); @@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p, static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_dead(struct task_struct *p) {} static inline void cgroup_task_release(struct task_struct *p) {} static inline void cgroup_task_free(struct task_struct *p) {} diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b3c27900c5d2..aae180d56c8c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_task_exit()/cgroup_task_free() dropping + * against cgroup_task_dead()/cgroup_task_free() dropping * the css_set. 
*/ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child, void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +void cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); @@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk) test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); + spin_unlock_irqrestore(&css_set_lock, flags); } void cgroup_task_release(struct task_struct *task) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..40f12e37f60f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. */ put_task_stack(prev); -- cgit v1.2.3 From 7900aa699c34401cf5d0c701d9ef72880ddc1a83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2025 10:25:13 -1000 Subject: sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch() sched_ext_free() was called from __put_task_struct() when the last reference to the task is dropped, which could be long after the task has finished running. This causes cgroup-related problems: - ops.init_task() can be called on a cgroup which didn't get ops.cgroup_init()'d during scheduler load, because the cgroup might be destroyed/unlinked while the zombie or dead task is still lingering on the scx_tasks list. - ops.cgroup_exit() could be called before ops.exit_task() is called on all member tasks, leading to incorrect exit ordering. Fix by moving it to finish_task_switch() to be called right after the final context switch away from the dying task, matching when sched_class->task_dead() is called. Rename it to sched_ext_dead() to match the new calling context. By calling sched_ext_dead() before cgroup_task_dead(), we ensure that: - Tasks visible on scx_tasks list have valid cgroups during scheduler load, as cgroup_mutex prevents cgroup destruction while the task is still linked. - All member tasks have ops.exit_task() called and are removed from scx_tasks before the cgroup can be destroyed and trigger ops.cgroup_exit(). This fix is made possible by the cgroup_task_dead() split in the previous patch. This also makes more sense resource-wise as there's no point in keeping scheduler side resources around for dead tasks. 
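For a BPF scheduler the resulting guarantee can be summarized with the following sketch (illustrative ordering only; sched_ext_dead() and cgroup_task_dead() are the helpers introduced in this series, the rest of finish_task_switch() is elided):

    /* after the final context switch away from the dying task @p */
    sched_ext_dead(p);    /* ops.exit_task() runs, @p drops off scx_tasks */
    cgroup_task_dead(p);  /* @p leaves its css_set; only now can the cgroup
                           * go empty and ops.cgroup_exit() be invoked */
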
Reported-by: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 4 ++-- kernel/fork.c | 1 - kernel/sched/core.c | 6 ++++++ kernel/sched/ext.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4713f374acc0..eb776b094d36 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -208,14 +208,14 @@ struct sched_ext_entity { struct list_head tasks_node; }; -void sched_ext_free(struct task_struct *p); +void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ -static inline void sched_ext_free(struct task_struct *p) {} +static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} static inline bool scx_rcu_cpu_stall(void) { return false; } diff --git a/kernel/fork.c b/kernel/fork.c index 960c39c9c264..5ae37909a813 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); unwind_task_free(tsk); - sched_ext_free(tsk); io_uring_free(tsk); cgroup_task_free(tsk); task_numa_free(tsk, true); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0324457622d7..3f4653106216 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5151,6 +5151,12 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); cgroup_task_dead(prev); /* Task is done with its stack. */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d1ef5bda95ae..2811e4f42a37 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2966,7 +2966,7 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; -- cgit v1.2.3 From c18d4b190a46651726c9a952667c74d2deb33c28 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Tue, 28 Oct 2025 20:30:05 +0000 Subject: net: Extend NAPI threaded polling to allow kthread based busy polling Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to enable and disable threaded busy polling. When threaded busy polling is enabled for a NAPI, enable NAPI_STATE_THREADED also. When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to signal napi_complete_done not to rearm interrupts. Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, the NAPI_STATE_IN_BUSY_POLL will be unset, napi_complete_done unsets the NAPI_STATE_SCHED_THREADED bit also, which in turn will make the kthread go to sleep. 
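The "do not re-arm" part leans on the existing early return in napi_complete_done(); conceptually it behaves like the sketch below (a paraphrase of the pre-existing check this series reuses, not a new hunk):

    /* napi_complete_done(): a busy-poll owner keeps the NAPI, skip irq re-arm */
    if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL)))
            return false;
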
Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Acked-by: Martin Karsten Tested-by: Martin Karsten Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 5 +-- Documentation/networking/napi.rst | 50 +++++++++++++++++++++++++++- include/linux/netdevice.h | 4 ++- include/uapi/linux/netdev.h | 1 + net/core/dev.c | 58 +++++++++++++++++++++++++++------ net/core/dev.h | 3 ++ net/core/netdev-genl-gen.c | 2 +- tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 109 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index e00d3fa1c152..10c412b7433f 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -88,7 +88,7 @@ definitions: - name: napi-threaded type: enum - entries: [disabled, enabled] + entries: [disabled, enabled, busy-poll] attribute-sets: - @@ -291,7 +291,8 @@ attribute-sets: name: threaded doc: Whether the NAPI is configured to operate in threaded polling mode. If this is set to enabled then the NAPI context operates - in threaded polling mode. + in threaded polling mode. If this is set to busy-poll, then the + threaded polling mode also busy polls. type: u32 enum: napi-threaded - diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 7dd60366f4ff..4e008efebb35 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -263,7 +263,9 @@ are not well known). Busy polling is enabled by either setting ``SO_BUSY_POLL`` on selected sockets or using the global ``net.core.busy_poll`` and ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling -also exists. +also exists. Threaded polling of NAPI also has a mode to busy poll for +packets (:ref:`threaded busy polling`) using the NAPI +processing kthread. epoll-based busy polling ------------------------ @@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is the recommended usage, because otherwise setting ``irq-suspend-timeout`` might not have any discernible effect. +.. _threaded_busy_poll: + +Threaded NAPI busy polling +-------------------------- + +Threaded NAPI busy polling extends threaded NAPI and adds support to do +continuous busy polling of the NAPI. This can be useful for forwarding or +AF_XDP applications. + +Threaded NAPI busy polling can be enabled on per NIC queue basis using Netlink. + +For example, using the following script: + +.. code-block:: bash + + $ ynl --family netdev --do napi-set \ + --json='{"id": 66, "threaded": "busy-poll"}' + +The kernel will create a kthread that busy polls on this NAPI. + +The user may elect to set the CPU affinity of this kthread to an unused CPU +core to improve how often the NAPI is polled at the expense of wasted CPU +cycles. Note that this will keep the CPU core busy with 100% usage. + +Once threaded busy polling is enabled for a NAPI, PID of the kthread can be +retrieved using Netlink so the affinity of the kthread can be set up. + +For example, the following script can be used to fetch the PID: + +.. code-block:: bash + + $ ynl --family netdev --do napi-get --json='{"id": 66}' + +This will output something like following, the pid `258` is the PID of the +kthread that is polling this NAPI. + +.. 
code-block:: bash + + $ {'defer-hard-irqs': 0, + 'gro-flush-timeout': 0, + 'id': 66, + 'ifindex': 2, + 'irq-suspend-timeout': 0, + 'pid': 258, + 'threaded': 'busy-poll'} + .. _threaded: Threaded NAPI diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c1e5042c5e7..e808071dbb7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -423,11 +423,12 @@ enum { NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ + NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */ }; enum { @@ -442,6 +443,7 @@ enum { NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), + NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), }; enum gro_result { diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 48eb49aa03d4..048c8de1a130 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index dccc1176f3c6..2c1de5fb97d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi) */ if ((val & NAPIF_STATE_SCHED_THREADED) || !(val & NAPIF_STATE_SCHED)) { - new = val & (~NAPIF_STATE_THREADED); + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); } else { msleep(20); continue; @@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) +{ + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; + + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); +} + int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { @@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi, } else { /* Make sure kthread is created before THREADED bit is set. 
*/ smp_mb__before_atomic(); - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return 0; @@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi) +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; @@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) } skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. + */ + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); + /* Call cond_resched here to avoid watchdog warnings. */ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); + } + if (!repoll) break; - - rcu_softirq_qs_periodic(last_qs); - cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; + + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); + + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; - while (!napi_thread_wait(napi)) - napi_threaded_poll_loop(napi); + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; + + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); + + napi_threaded_poll_loop(napi, want_busy_poll); + } return 0; } @@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog); + napi_threaded_poll_loop(&sd->backlog, false); } static void backlog_napi_setup(unsigned int cpu) diff --git a/net/core/dev.h b/net/core/dev.h index 900880e8b5b4..4d872a79bafb 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + if (test_bit(NAPI_STATE_THREADED, &n->state)) return NETDEV_NAPI_THREADED_ENABLED; diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e9a2a6f26cb7..ff20435c45d2 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 48eb49aa03d4..048c8de1a130 100644 --- 
a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { -- cgit v1.2.3 From abcf6eef90c6e47efed62a7c233ffc1a6a90797e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 27 Oct 2025 13:27:58 +0100 Subject: net: phy: introduce internal API for PHY MSE diagnostics Add the base infrastructure for Mean Square Error (MSE) diagnostics, as proposed by the OPEN Alliance "Advanced diagnostic features for 100BASE-T1 automotive Ethernet PHYs" [1] specification. The OPEN Alliance spec defines only average MSE and average peak MSE over a fixed number of symbols. However, other PHYs, such as the KSZ9131, additionally expose a worst-peak MSE value latched since the last channel capture. This API accounts for such vendor extensions by adding a distinct capability bit and snapshot field. Channel-to-pair mapping is normally straightforward, but in some cases (e.g. 100BASE-TX with MDI-X resolution unknown) the mapping is ambiguous. If hardware does not expose MDI-X status, the exact pair cannot be determined. To avoid returning misleading per-channel data in this case, a LINK selector is defined for aggregate MSE measurements. All investigated devices differ in MSE capabilities, such as sample rate, number of analyzed symbols, and scaling factors. For example, the KSZ9131 uses different scaling for MSE and pMSE. To make this visible to callers, scale limits and timing information are returned via get_mse_capability(). Some PHYs sample very few symbols at high frequency (e.g. 2 us update rate). To cover such cases and allow for future high-speed PHYs with even shorter intervals, the refresh rate is reported as u64 in picoseconds. This patch introduces the internal PHY API for Mean Square Error diagnostics. It defines new kernel-side data types and driver hooks: - struct phy_mse_capability: describes supported metrics, scale limits, update interval, and sampling length. - struct phy_mse_snapshot: holds one correlated measurement set. - New phy_driver ops: `get_mse_capability()` and `get_mse_snapshot()`. These definitions form the core kernel API. No user-visible interfaces are added in this commit. Standardization notes: OPEN Alliance defines presence and interpretation of some metrics but does not fix numeric scales or sampling internals: - SQI (3-bit, 0..7) is mandatory; correlation to SNR/BER is informative (OA 100BASE-T1 TC1 v1.0 6.1.2; OA 1000BASE-T1 TC12 v2.2 6.1.2). - MSE is optional; OA recommends 2^16 symbols and scaling to 0..511, with a worst-case latch since last read (OA 100BASE-T1 TC1 v1.0 6.1.1; OA 1000BASE-T1 TC12 v2.2 6.1.1). Refresh is recommended (~0.8-2.0 ms for 100BASE-T1; ~80-200 us for 1000BASE-T1). Exact scaling/time windows are vendor-specific. - Peak MSE (pMSE) is defined only for 100BASE-T1 as optional, e.g. 128-symbol sliding window with 8-bit range and worst-case latch (OA 100BASE-T1 TC1 v1.0 6.1.3). Therefore this API exposes which measures and selectors a PHY supports, and documents where behavior is standard-referenced vs vendor-specific. 
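As a usage illustration only (no generic phylib accessor is added by this patch, so the driver ops are called directly; locking is omitted and every name outside phy.h is hypothetical):

    /* Hypothetical consumer: read channel A and normalize the average MSE. */
    static int report_mse_ppm(struct phy_device *phydev)
    {
            const struct phy_driver *drv = phydev->drv;
            struct phy_mse_capability cap = {};
            struct phy_mse_snapshot snap = {};
            int ret;

            if (!drv->get_mse_capability || !drv->get_mse_snapshot)
                    return -EOPNOTSUPP;

            ret = drv->get_mse_capability(phydev, &cap);
            if (ret)
                    return ret;

            if (!(cap.supported_caps & PHY_MSE_CAP_AVG) ||
                !(cap.supported_caps & PHY_MSE_CAP_CHANNEL_A))
                    return -EOPNOTSUPP;

            ret = drv->get_mse_snapshot(phydev, PHY_MSE_CHANNEL_A, &snap);
            if (ret)
                    return ret;

            /* max_average_mse defines the scale, so express as ppm of full scale */
            phydev_info(phydev, "avg MSE: %llu ppm\n",
                        div64_u64(snap.average_mse * 1000000ULL,
                                  cap.max_average_mse));
            return 0;
    }
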
[1] Signed-off-by: Oleksij Rempel Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20251027122801.982364-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 358dd6f0ff96..e3474f03cbc1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -903,6 +903,165 @@ struct phy_led { #define to_phy_led(d) container_of(d, struct phy_led, led_cdev) +/* + * PHY_MSE_CAP_* - Bitmask flags for Mean Square Error (MSE) capabilities + * + * These flags describe which MSE metrics and selectors are implemented + * by the PHY for the current link mode. They are used in + * struct phy_mse_capability.supported_caps. + * + * Standardization: + * The OPEN Alliance (OA) defines the presence of MSE/SQI/pMSE but not their + * numeric scaling, update intervals, or aggregation windows. See: + * OA 100BASE-T1 TC1 v1.0, sections 6.1.1-6.1.3 + * OA 1000BASE-T1 TC12 v2.2, sections 6.1.1-6.1.2 + * + * Description of flags: + * + * PHY_MSE_CAP_CHANNEL_A + * Per-pair diagnostics for Channel A are supported. Mapping to the + * physical wire pair may depend on MDI/MDI-X polarity. + * + * PHY_MSE_CAP_CHANNEL_B, _C, _D + * Same as above for channels B-D. + * + * PHY_MSE_CAP_WORST_CHANNEL + * The PHY or driver can identify and report the single worst-performing + * channel without querying each one individually. + * + * PHY_MSE_CAP_LINK + * The PHY provides only a link-wide aggregate measurement or cannot map + * results to a specific pair (for example 100BASE-TX with unknown + * MDI/MDI-X). + * + * PHY_MSE_CAP_AVG + * Average MSE (mean DCQ metric) is supported. For 100/1000BASE-T1 the OA + * recommends 2^16 symbols, scaled 0..511, but the exact scaling is + * vendor-specific. + * + * PHY_MSE_CAP_PEAK + * Peak MSE (current peak within the measurement window) is supported. + * Defined as pMSE for 100BASE-T1; vendor-specific for others. + * + * PHY_MSE_CAP_WORST_PEAK + * Latched worst-case peak MSE since the last read (read-to-clear if + * implemented). Optional in OA 100BASE-T1 TC1 6.1.3. + */ +#define PHY_MSE_CAP_CHANNEL_A BIT(0) +#define PHY_MSE_CAP_CHANNEL_B BIT(1) +#define PHY_MSE_CAP_CHANNEL_C BIT(2) +#define PHY_MSE_CAP_CHANNEL_D BIT(3) +#define PHY_MSE_CAP_WORST_CHANNEL BIT(4) +#define PHY_MSE_CAP_LINK BIT(5) +#define PHY_MSE_CAP_AVG BIT(6) +#define PHY_MSE_CAP_PEAK BIT(7) +#define PHY_MSE_CAP_WORST_PEAK BIT(8) + +/* + * enum phy_mse_channel - Identifiers for selecting MSE measurement channels + * + * PHY_MSE_CHANNEL_A - PHY_MSE_CHANNEL_D + * Select per-pair measurement for the corresponding channel. + * + * PHY_MSE_CHANNEL_WORST + * Select the single worst-performing channel reported by hardware. + * + * PHY_MSE_CHANNEL_LINK + * Select link-wide aggregate data (used when per-pair results are + * unavailable). + */ +enum phy_mse_channel { + PHY_MSE_CHANNEL_A, + PHY_MSE_CHANNEL_B, + PHY_MSE_CHANNEL_C, + PHY_MSE_CHANNEL_D, + PHY_MSE_CHANNEL_WORST, + PHY_MSE_CHANNEL_LINK, +}; + +/** + * struct phy_mse_capability - Capabilities of Mean Square Error (MSE) + * measurement interface + * + * Standardization notes: + * + * - Presence of MSE/SQI/pMSE is defined by OPEN Alliance specs, but numeric + * scaling, refresh/update rate and aggregation windows are not fixed and + * are vendor-/product-specific. 
(OA 100BASE-T1 TC1 v1.0 6.1.*; + * OA 1000BASE-T1 TC12 v2.2 6.1.*) + * + * - Typical recommendations: 2^16 symbols and 0..511 scaling for MSE; pMSE only + * defined for 100BASE-T1 (sliding window example), others are vendor + * extensions. Drivers must report actual scale/limits here. + * + * Describes the MSE measurement capabilities for the current link mode. These + * properties are dynamic and may change when link settings are modified. + * Callers should re-query this capability after any link state change to + * ensure they have the most up-to-date information. + * + * Callers should only request measurements for channels and types that are + * indicated as supported by the @supported_caps bitmask. If @supported_caps + * is 0, the device provides no MSE diagnostics, and driver operations should + * typically return -EOPNOTSUPP. + * + * Snapshot values for average and peak MSE can be normalized to a 0..1 ratio + * by dividing the raw snapshot by the corresponding @max_average_mse or + * @max_peak_mse value. + * + * @max_average_mse: The maximum value for an average MSE snapshot. This + * defines the scale for the measurement. If the PHY_MSE_CAP_AVG capability is + * supported, this value MUST be greater than 0. (vendor-specific units). + * @max_peak_mse: The maximum value for a peak MSE snapshot. If either + * PHY_MSE_CAP_PEAK or PHY_MSE_CAP_WORST_PEAK is supported, this value MUST + * be greater than 0. (vendor-specific units). + * @refresh_rate_ps: The typical interval, in picoseconds, between hardware + * updates of the MSE values. This is an estimate, and callers should not + * assume synchronous sampling. (vendor-specific units). + * @num_symbols: The number of symbols aggregated per hardware sample to + * calculate the MSE. (vendor-specific units). + * @supported_caps: A bitmask of PHY_MSE_CAP_* values indicating which + * measurement types (e.g., average, peak) and channels + * (e.g., per-pair or link-wide) are supported. + */ +struct phy_mse_capability { + u64 max_average_mse; + u64 max_peak_mse; + u64 refresh_rate_ps; + u64 num_symbols; + u32 supported_caps; +}; + +/** + * struct phy_mse_snapshot - A snapshot of Mean Square Error (MSE) diagnostics + * + * Holds a set of MSE diagnostic values that were all captured from a single + * measurement window. + * + * Values are raw, device-scaled and not normalized. Use struct + * phy_mse_capability to interpret the scale and sampling window. + * + * @average_mse: The average MSE value over the measurement window. + * OPEN Alliance references MSE as a DCQ metric; recommends 2^16 symbols and + * 0..511 scaling. Exact scale and refresh are vendor-specific. + * (100BASE-T1 TC1 v1.0 6.1.1; 1000BASE-T1 TC12 v2.2 6.1.1). + * + * @peak_mse: The peak MSE value observed within the measurement window. + * For 100BASE-T1, "pMSE" is optional and may be implemented via a sliding + * 128-symbol window with periodic capture; not standardized for 1000BASE-T1. + * (100BASE-T1 TC1 v1.0 6.1.3, Table "DCQ.peakMSE"). + * + * @worst_peak_mse: A latched high-water mark of the peak MSE since last read + * (read-to-clear if implemented). OPEN Alliance shows a latched "worst case + * peak MSE" for 100BASE-T1 pMSE; availability/semantics outside that are + * vendor-specific. (100BASE-T1 TC1 v1.0 6.1.3, DCQ.peakMSE high byte; + * 1000BASE-T1 TC12 v2.2 treats DCQ details as vendor-specific.) 
+ */ +struct phy_mse_snapshot { + u64 average_mse; + u64 peak_mse; + u64 worst_peak_mse; +}; + /** * struct phy_driver - Driver structure for a particular PHY type * @@ -1184,6 +1343,53 @@ struct phy_driver { /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); + /** + * @get_mse_capability: Get capabilities and scale of MSE measurement + * @dev: PHY device + * @cap: Output (filled on success) + * + * Fill @cap with the PHY's MSE capability for the current + * link mode: scale limits (max_average_mse, max_peak_mse), update + * interval (refresh_rate_ps), sample length (num_symbols) and the + * capability bitmask (supported_caps). + * + * Implementations may defer capability report until hardware has + * converged; in that case they should return -EAGAIN and allow the + * caller to retry later. + * + * Return: 0 on success. On failure, returns a negative errno code, such + * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in + * the current link mode, or -EAGAIN if the capability information is + * not yet available. + */ + int (*get_mse_capability)(struct phy_device *dev, + struct phy_mse_capability *cap); + + /** + * @get_mse_snapshot: Retrieve a snapshot of MSE diagnostic values + * @dev: PHY device + * @channel: Channel identifier (PHY_MSE_CHANNEL_*) + * @snapshot: Output (filled on success) + * + * Fill @snapshot with a correlated set of MSE values from the most + * recent measurement window. + * + * Callers must validate @channel against supported_caps returned by + * get_mse_capability(). Drivers must not coerce @channel; if the + * requested selector is not implemented by the device or current link + * mode, the operation must fail. + * + * worst_peak_mse is latched and must be treated as read-to-clear. + * + * Return: 0 on success. On failure, returns a negative errno code, such + * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in + * the current link mode, or -EAGAIN if measurements are not yet + * available. + */ + int (*get_mse_snapshot)(struct phy_device *dev, + enum phy_mse_channel channel, + struct phy_mse_snapshot *snapshot); + /* PLCA RS interface */ /** @get_plca_cfg: Return the current PLCA configuration */ int (*get_plca_cfg)(struct phy_device *dev, -- cgit v1.2.3 From 85d55d8cc3ef7f77b249c97e9fac6a0fc5f5daa7 Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Tue, 30 Sep 2025 11:18:06 +0530 Subject: soc: qcom: ubwc: Add config for Kaanapali Add the ubwc configuration for Kaanapali chipset. This chipset brings support for UBWC v6 version. The rest of the configurations remains as usual. 
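Consumers pick this up through qcom_ubwc_config_get_data(); a minimal sketch of gating on the new decoder version (assuming the helper returns an ERR_PTR on lookup failure and that a struct device *dev is in scope, both outside the scope of this patch):

    const struct qcom_ubwc_cfg_data *cfg = qcom_ubwc_config_get_data();

    if (IS_ERR(cfg))
            return PTR_ERR(cfg);

    if (cfg->ubwc_dec_version >= UBWC_6_0)
            dev_dbg(dev, "UBWC 6.0 decoder, highest bank bit %u\n",
                    cfg->highest_bank_bit);
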
Signed-off-by: Akhil P Oommen Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250930-kaana-gpu-support-v1-1-73530b0700ed@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/ubwc_config.c | 11 +++++++++++ include/linux/soc/qcom/ubwc.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/ubwc_config.c b/drivers/soc/qcom/ubwc_config.c index 942fe6c17612..1c09796163b0 100644 --- a/drivers/soc/qcom/ubwc_config.c +++ b/drivers/soc/qcom/ubwc_config.c @@ -16,6 +16,16 @@ static const struct qcom_ubwc_cfg_data no_ubwc_data = { /* no UBWC, no HBB */ }; +static const struct qcom_ubwc_cfg_data kaanapali_data = { + .ubwc_enc_version = UBWC_6_0, + .ubwc_dec_version = UBWC_6_0, + .ubwc_swizzle = UBWC_SWIZZLE_ENABLE_LVL2 | + UBWC_SWIZZLE_ENABLE_LVL3, + .ubwc_bank_spread = true, + .highest_bank_bit = 16, + .macrotile_mode = true, +}; + static const struct qcom_ubwc_cfg_data msm8937_data = { .ubwc_enc_version = UBWC_1_0, .ubwc_dec_version = UBWC_1_0, @@ -234,6 +244,7 @@ static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = { { .compatible = "qcom,apq8026", .data = &no_ubwc_data }, { .compatible = "qcom,apq8074", .data = &no_ubwc_data }, { .compatible = "qcom,apq8096", .data = &msm8998_data }, + { .compatible = "qcom,kaanapali", .data = &kaanapali_data, }, { .compatible = "qcom,glymur", .data = &glymur_data}, { .compatible = "qcom,msm8226", .data = &no_ubwc_data }, { .compatible = "qcom,msm8916", .data = &no_ubwc_data }, diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index 1ed8b1b16bc9..0a4edfe3d96d 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -52,6 +52,7 @@ struct qcom_ubwc_cfg_data { #define UBWC_4_0 0x40000000 #define UBWC_4_3 0x40030000 #define UBWC_5_0 0x50000000 +#define UBWC_6_0 0x60000000 #if IS_ENABLED(CONFIG_QCOM_UBWC_CONFIG) const struct qcom_ubwc_cfg_data *qcom_ubwc_config_get_data(void); -- cgit v1.2.3 From 603c646f001008eaf8b5a7a888043e5cc8c494a2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:53 -0700 Subject: coco/tsm: Introduce a core device for TEE Security Managers A "TSM" is a platform component that provides an API for securely provisioning resources for a confidential guest (TVM) to consume. The name originates from the PCI specification for platform agent that carries out operations for PCIe TDISP (TEE Device Interface Security Protocol). Instances of this core device are parented by a device representing the platform security function like CONFIG_CRYPTO_DEV_CCP or CONFIG_INTEL_TDX_HOST. This device interface is a frontend to the aspects of a TSM and TEE I/O that are cross-architecture common. This includes mechanisms like enumerating available platform TEE I/O capabilities and provisioning connections between the platform TSM and device DSMs (Device Security Manager (TDISP)). For now this is just the scaffolding for registering a TSM device sysfs interface. 
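A minimal usage sketch for a low-level platform security driver (the driver name and probe/remove wiring are hypothetical; only tsm_register()/tsm_unregister() come from this patch):

    static struct tsm_dev *my_tsm;

    static int my_tsm_probe(struct platform_device *pdev)
    {
            my_tsm = tsm_register(&pdev->dev);

            return PTR_ERR_OR_ZERO(my_tsm);
    }

    static void my_tsm_remove(struct platform_device *pdev)
    {
            tsm_unregister(my_tsm);
    }
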
Cc: Xu Yilun Reviewed-by: Jonathan Cameron Co-developed-by: Aneesh Kumar K.V (Arm) Signed-off-by: Aneesh Kumar K.V (Arm) Acked-by: Bjorn Helgaas Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-2-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-class-tsm | 9 +++ MAINTAINERS | 2 +- drivers/virt/coco/Kconfig | 3 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/tsm-core.c | 93 +++++++++++++++++++++++++++++++ include/linux/tsm.h | 11 ++++ 6 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-class-tsm create mode 100644 drivers/virt/coco/tsm-core.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm new file mode 100644 index 000000000000..2949468deaf7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-tsm @@ -0,0 +1,9 @@ +What: /sys/class/tsm/tsmN +Contact: linux-coco@lists.linux.dev +Description: + "tsmN" is a device that represents the generic attributes of a + platform TEE Security Manager. It is typically a child of a + platform enumerated TSM device. /sys/class/tsm/tsmN/uevent + signals when the PCI layer is able to support establishment of + link encryption and other device-security features coordinated + through a platform tsm. diff --git a/MAINTAINERS b/MAINTAINERS index 46bd8e033042..b8c9929532ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26112,7 +26112,7 @@ M: David Lechner S: Maintained F: Documentation/devicetree/bindings/trigger-source/* -TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE +TRUSTED EXECUTION ENVIRONMENT SECURITY MANAGER (TSM) M: Dan Williams L: linux-coco@lists.linux.dev S: Maintained diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index 819a97e8ba99..bb0c6d6ddcc8 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -14,3 +14,6 @@ source "drivers/virt/coco/tdx-guest/Kconfig" source "drivers/virt/coco/arm-cca-guest/Kconfig" source "drivers/virt/coco/guest/Kconfig" + +config TSM + bool diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index f918bbb61737..cb52021912b3 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ +obj-$(CONFIG_TSM) += tsm-core.o obj-$(CONFIG_TSM_GUEST) += guest/ diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c new file mode 100644 index 000000000000..347507cc5e3f --- /dev/null +++ b/drivers/virt/coco/tsm-core.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +static struct class *tsm_class; +static DECLARE_RWSEM(tsm_rwsem); +static DEFINE_IDA(tsm_ida); + +static struct tsm_dev *alloc_tsm_dev(struct device *parent) +{ + struct device *dev; + int id; + + struct tsm_dev *tsm_dev __free(kfree) = + kzalloc(sizeof(*tsm_dev), GFP_KERNEL); + if (!tsm_dev) + return ERR_PTR(-ENOMEM); + + id = ida_alloc(&tsm_ida, GFP_KERNEL); + if (id < 0) + return ERR_PTR(id); + + tsm_dev->id = id; + dev = &tsm_dev->dev; + dev->parent = parent; + dev->class = tsm_class; + device_initialize(dev); + + return no_free_ptr(tsm_dev); +} + +struct tsm_dev *tsm_register(struct device *parent) +{ + struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent); + struct device *dev; + int rc; + + if (IS_ERR(tsm_dev)) + return tsm_dev; + + dev = &tsm_dev->dev; + rc = dev_set_name(dev, "tsm%d", tsm_dev->id); + if (rc) + return ERR_PTR(rc); + + rc = device_add(dev); + if (rc) + return ERR_PTR(rc); + + return no_free_ptr(tsm_dev); +} +EXPORT_SYMBOL_GPL(tsm_register); + +void tsm_unregister(struct tsm_dev *tsm_dev) +{ + device_unregister(&tsm_dev->dev); +} +EXPORT_SYMBOL_GPL(tsm_unregister); + +static void tsm_release(struct device *dev) +{ + struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev); + + ida_free(&tsm_ida, tsm_dev->id); + kfree(tsm_dev); +} + +static int __init tsm_init(void) +{ + tsm_class = class_create("tsm"); + if (IS_ERR(tsm_class)) + return PTR_ERR(tsm_class); + + tsm_class->dev_release = tsm_release; + return 0; +} +module_init(tsm_init) + +static void __exit tsm_exit(void) +{ + class_destroy(tsm_class); +} +module_exit(tsm_exit) + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TEE Security Manager Class Device"); diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 431054810dca..cd97c63ffa32 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -5,6 +5,7 @@ #include #include #include +#include #define TSM_REPORT_INBLOB_MAX 64 #define TSM_REPORT_OUTBLOB_MAX SZ_32K @@ -107,6 +108,16 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct tsm_dev { + struct device dev; + int id; +}; + +DEFINE_FREE(put_tsm_dev, struct tsm_dev *, + if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) + int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); +struct tsm_dev *tsm_register(struct device *parent); +void tsm_unregister(struct tsm_dev *tsm_dev); #endif /* __TSM_H */ -- cgit v1.2.3 From f16469ee733ac52b2373216803699cbb05e82786 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:54 -0700 Subject: PCI/IDE: Enumerate Selective Stream IDE capabilities Link encryption is a new PCIe feature enumerated by "PCIe r7.0 section 7.9.26 IDE Extended Capability". It is both a standalone port + endpoint capability, and a building block for the security protocol defined by "PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP)". That protocol coordinates device security setup between a platform TSM (TEE Security Manager) and a device DSM (Device Security Manager). While the platform TSM can allocate resources like Stream ID and manage keys, it still requires system software to manage the IDE capability register block. Add register definitions and basic enumeration in preparation for Selective IDE Stream establishment. A follow on change selects the new CONFIG_PCI_IDE symbol. 
Note that while the IDE specification defines both a point-to-point "Link Stream" and a Root Port to endpoint "Selective Stream", only "Selective Stream" is considered for Linux as that is the predominant mode expected by Trusted Execution Environment Security Managers (TSMs), and it is the security model that limits the number of PCI components within the TCB in a PCIe topology with switches. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Reviewed-by: Aneesh Kumar K.V Link: https://patch.msgid.link/20251031212902.2256310-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/Kconfig | 3 ++ drivers/pci/Makefile | 1 + drivers/pci/ide.c | 88 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 6 +++ drivers/pci/probe.c | 1 + include/linux/pci.h | 7 ++++ include/uapi/linux/pci_regs.h | 81 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 187 insertions(+) create mode 100644 drivers/pci/ide.c (limited to 'include/linux') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index f94f5d384362..b28423e2057f 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -122,6 +122,9 @@ config XEN_PCIDEV_FRONTEND config PCI_ATS bool +config PCI_IDE + bool + config PCI_DOE bool "Enable PCI Data Object Exchange (DOE) support" help diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 67647f1880fb..6612256fd37d 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_PCI_P2PDMA) += p2pdma.o obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o obj-$(CONFIG_VGA_ARB) += vgaarb.o obj-$(CONFIG_PCI_DOE) += doe.o +obj-$(CONFIG_PCI_IDE) += ide.o obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c new file mode 100644 index 000000000000..26866edf91b4 --- /dev/null +++ b/drivers/pci/ide.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */ + +/* PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) */ + +#define dev_fmt(fmt) "PCI/IDE: " fmt +#include +#include +#include + +#include "pci.h" + +static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index, + u8 nr_ide_mem) +{ + u32 offset = ide_cap + PCI_IDE_LINK_STREAM_0 + + nr_link_ide * PCI_IDE_LINK_BLOCK_SIZE; + + /* + * Assume a constant number of address association resources per stream + * index + */ + return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem); +} + +void pci_ide_init(struct pci_dev *pdev) +{ + u16 nr_link_ide, nr_ide_mem, nr_streams; + u16 ide_cap; + u32 val; + + if (!pci_is_pcie(pdev)) + return; + + ide_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_IDE); + if (!ide_cap) + return; + + pci_read_config_dword(pdev, ide_cap + PCI_IDE_CAP, &val); + if ((val & PCI_IDE_CAP_SELECTIVE) == 0) + return; + + /* + * Require endpoint IDE capability to be paired with IDE Root Port IDE + * capability. 
+ */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) { + struct pci_dev *rp = pcie_find_root_port(pdev); + + if (!rp->ide_cap) + return; + } + + pdev->ide_cfg = FIELD_GET(PCI_IDE_CAP_SEL_CFG, val); + pdev->ide_tee_limit = FIELD_GET(PCI_IDE_CAP_TEE_LIMITED, val); + + if (val & PCI_IDE_CAP_LINK) + nr_link_ide = 1 + FIELD_GET(PCI_IDE_CAP_LINK_TC_NUM, val); + else + nr_link_ide = 0; + + nr_ide_mem = 0; + nr_streams = 1 + FIELD_GET(PCI_IDE_CAP_SEL_NUM, val); + for (u16 i = 0; i < nr_streams; i++) { + int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); + int nr_assoc; + u32 val; + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); + + /* + * Let's not entertain streams that do not have a constant + * number of address association blocks + */ + nr_assoc = FIELD_GET(PCI_IDE_SEL_CAP_ASSOC_NUM, val); + if (i && (nr_assoc != nr_ide_mem)) { + pci_info(pdev, "Unsupported Selective Stream %d capability, SKIP the rest\n", i); + nr_streams = i; + break; + } + + nr_ide_mem = nr_assoc; + } + + pdev->ide_cap = ide_cap; + pdev->nr_link_ide = nr_link_ide; + pdev->nr_ide_mem = nr_ide_mem; +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4492b809094b..86ef13e7cece 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -613,6 +613,12 @@ static inline void pci_doe_sysfs_init(struct pci_dev *pdev) { } static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCI_IDE +void pci_ide_init(struct pci_dev *dev); +#else +static inline void pci_ide_init(struct pci_dev *dev) { } +#endif + /** * pci_dev_set_io_state - Set the new error state if possible. * diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0ce98e18b5a8..4c55020f3ddf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2667,6 +2667,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_doe_init(dev); /* Data Object Exchange */ pci_tph_init(dev); /* TLP Processing Hints */ pci_rebar_init(dev); /* Resizable BAR */ + pci_ide_init(dev); /* Link Integrity and Data Encryption */ pcie_report_downtraining(dev); pci_init_reset_methods(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..4402ca931124 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -539,6 +539,13 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_NPEM struct npem *npem; /* Native PCIe Enclosure Management */ +#endif +#ifdef CONFIG_PCI_IDE + u16 ide_cap; /* Link Integrity & Data Encryption */ + u8 nr_ide_mem; /* Address association resources for streams */ + u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + unsigned int ide_cfg:1; /* Config cycles over IDE */ + unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif u16 acs_cap; /* ACS Capability offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 07e06aafec50..05bd22d9e352 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -754,6 +754,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1249,4 +1250,84 @@ #define PCI_DVSEC_CXL_PORT_CTL 0x0c #define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 +/* Integrity and Data Encryption 
Extended Capability */ +#define PCI_IDE_CAP 0x04 +#define PCI_IDE_CAP_LINK 0x1 /* Link IDE Stream Supported */ +#define PCI_IDE_CAP_SELECTIVE 0x2 /* Selective IDE Streams Supported */ +#define PCI_IDE_CAP_FLOWTHROUGH 0x4 /* Flow-Through IDE Stream Supported */ +#define PCI_IDE_CAP_PARTIAL_HEADER_ENC 0x8 /* Partial Header Encryption Supported */ +#define PCI_IDE_CAP_AGGREGATION 0x10 /* Aggregation Supported */ +#define PCI_IDE_CAP_PCRC 0x20 /* PCRC Supported */ +#define PCI_IDE_CAP_IDE_KM 0x40 /* IDE_KM Protocol Supported */ +#define PCI_IDE_CAP_SEL_CFG 0x80 /* Selective IDE for Config Request Support */ +#define PCI_IDE_CAP_ALG __GENMASK(12, 8) /* Supported Algorithms */ +#define PCI_IDE_CAP_ALG_AES_GCM_256 0 /* AES-GCM 256 key size, 96b MAC */ +#define PCI_IDE_CAP_LINK_TC_NUM __GENMASK(15, 13) /* Link IDE TCs */ +#define PCI_IDE_CAP_SEL_NUM __GENMASK(23, 16) /* Supported Selective IDE Streams */ +#define PCI_IDE_CAP_TEE_LIMITED 0x1000000 /* TEE-Limited Stream Supported */ +#define PCI_IDE_CTL 0x08 +#define PCI_IDE_CTL_FLOWTHROUGH_IDE 0x4 /* Flow-Through IDE Stream Enabled */ + +#define PCI_IDE_LINK_STREAM_0 0xc /* First Link Stream Register Block */ +#define PCI_IDE_LINK_BLOCK_SIZE 8 +/* Link IDE Stream block, up to PCI_IDE_CAP_LINK_TC_NUM */ +#define PCI_IDE_LINK_CTL_0 0x00 /* First Link Control Register Offset in block */ +#define PCI_IDE_LINK_CTL_EN 0x1 /* Link IDE Stream Enable */ +#define PCI_IDE_LINK_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_LINK_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_LINK_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_LINK_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define PCI_IDE_LINK_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_LINK_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_LINK_STS_0 0x4 /* First Link Status Register Offset in block */ +#define PCI_IDE_LINK_STS_STATE __GENMASK(3, 0) /* Link IDE Stream State */ +#define PCI_IDE_LINK_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ + +/* Selective IDE Stream block, up to PCI_IDE_CAP_SELECTIVE_STREAMS_NUM */ +/* Selective IDE Stream Capability Register */ +#define PCI_IDE_SEL_CAP 0x00 +#define PCI_IDE_SEL_CAP_ASSOC_NUM __GENMASK(3, 0) +/* Selective IDE Stream Control Register */ +#define PCI_IDE_SEL_CTL 0x04 +#define PCI_IDE_SEL_CTL_EN 0x1 /* Selective IDE Stream Enable */ +#define PCI_IDE_SEL_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_SEL_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_SEL_CTL_CFG_EN 0x200 /* Selective IDE for Configuration Requests */ +#define PCI_IDE_SEL_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_SEL_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define PCI_IDE_SEL_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_SEL_CTL_DEFAULT 0x400000 /* Default Stream */ +#define PCI_IDE_SEL_CTL_TEE_LIMITED 0x800000 /* TEE-Limited Stream */ +#define PCI_IDE_SEL_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_SEL_CTL_ID_MAX 255 +/* Selective IDE Stream Status Register */ +#define PCI_IDE_SEL_STS 0x08 +#define 
PCI_IDE_SEL_STS_STATE __GENMASK(3, 0) /* Selective IDE Stream State */ +#define PCI_IDE_SEL_STS_STATE_INSECURE 0 +#define PCI_IDE_SEL_STS_STATE_SECURE 2 +#define PCI_IDE_SEL_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ +/* IDE RID Association Register 1 */ +#define PCI_IDE_SEL_RID_1 0x0c +#define PCI_IDE_SEL_RID_1_LIMIT __GENMASK(23, 8) +/* IDE RID Association Register 2 */ +#define PCI_IDE_SEL_RID_2 0x10 +#define PCI_IDE_SEL_RID_2_VALID 0x1 +#define PCI_IDE_SEL_RID_2_BASE __GENMASK(23, 8) +#define PCI_IDE_SEL_RID_2_SEG __GENMASK(31, 24) +/* Selective IDE Address Association Register Block, up to PCI_IDE_SEL_CAP_ASSOC_NUM */ +#define PCI_IDE_SEL_ADDR_BLOCK_SIZE 12 +#define PCI_IDE_SEL_ADDR_1(x) (20 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_ADDR_1_VALID 0x1 +#define PCI_IDE_SEL_ADDR_1_BASE_LOW __GENMASK(19, 8) +#define PCI_IDE_SEL_ADDR_1_LIMIT_LOW __GENMASK(31, 20) +/* IDE Address Association Register 2 is "Memory Limit Upper" */ +#define PCI_IDE_SEL_ADDR_2(x) (24 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +/* IDE Address Association Register 3 is "Memory Base Upper" */ +#define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From 215afa89d249bb095126cf00f8be719e421c75e9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:55 -0700 Subject: PCI: Introduce pci_walk_bus_reverse(), for_each_pci_dev_reverse() PCI/TSM, the PCI core functionality for the PCIe TEE Device Interface Security Protocol (TDISP), has a need to walk all subordinate functions of a Device Security Manager (DSM) to setup a device security context. A DSM is physical function 0 of multi-function or SR-IOV device endpoint, or it is an upstream switch port. In error scenarios or when a TEE Security Manager (TSM) device is removed it needs to unwind all established DSM contexts. Introduce reverse versions of PCI device iteration helpers to mirror the setup path and ensure that dependent children are handled before parents. Cc: Greg Kroah-Hartman Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/base/bus.c | 38 ++++++++++++++++++++++++++++ drivers/pci/bus.c | 39 +++++++++++++++++++++++++++++ drivers/pci/search.c | 62 ++++++++++++++++++++++++++++++++++++++++------ include/linux/device/bus.h | 3 +++ include/linux/pci.h | 11 ++++++++ 5 files changed, 145 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 5e75e1bce551..d19dae8f9d1b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -334,6 +334,19 @@ static struct device *next_device(struct klist_iter *i) return dev; } +static struct device *prev_device(struct klist_iter *i) +{ + struct klist_node *n = klist_prev(i); + struct device *dev = NULL; + struct device_private *dev_prv; + + if (n) { + dev_prv = to_device_private_bus(n); + dev = dev_prv->device; + } + return dev; +} + /** * bus_for_each_dev - device iterator. * @bus: bus type. 
@@ -414,6 +427,31 @@ struct device *bus_find_device(const struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_find_device); +struct device *bus_find_device_reverse(const struct bus_type *bus, + struct device *start, const void *data, + device_match_t match) +{ + struct subsys_private *sp = bus_to_subsys(bus); + struct klist_iter i; + struct device *dev; + + if (!sp) + return NULL; + + klist_iter_init_node(&sp->klist_devices, &i, + (start ? &start->p->knode_bus : NULL)); + while ((dev = prev_device(&i))) { + if (match(dev, data)) { + get_device(dev); + break; + } + } + klist_iter_exit(&i); + subsys_put(sp); + return dev; +} +EXPORT_SYMBOL_GPL(bus_find_device_reverse); + static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index f26aec6ff588..b8b17f825bc0 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -432,6 +433,27 @@ static int __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void return ret; } +static int __pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + struct pci_dev *dev; + int ret = 0; + + list_for_each_entry_reverse(dev, &top->devices, bus_list) { + if (dev->subordinate) { + ret = __pci_walk_bus_reverse(dev->subordinate, cb, + userdata); + if (ret) + break; + } + ret = cb(dev, userdata); + if (ret) + break; + } + return ret; +} + /** * pci_walk_bus - walk devices on/under bus, calling callback. * @top: bus whose devices should be walked @@ -453,6 +475,23 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void } EXPORT_SYMBOL_GPL(pci_walk_bus); +/** + * pci_walk_bus_reverse - walk devices on/under bus, calling callback. + * @top: bus whose devices should be walked + * @cb: callback to be called for each device found + * @userdata: arbitrary pointer to be passed to callback + * + * Same semantics as pci_walk_bus(), but walks the bus in reverse order. 
+ */ +void pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), void *userdata) +{ + down_read(&pci_bus_sem); + __pci_walk_bus_reverse(top, cb, userdata); + up_read(&pci_bus_sem); +} +EXPORT_SYMBOL_GPL(pci_walk_bus_reverse); + void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata) { lockdep_assert_held(&pci_bus_sem); diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 53840634fbfc..e6e84dc62e82 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -282,6 +282,45 @@ static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id, return pdev; } +static struct pci_dev *pci_get_dev_by_id_reverse(const struct pci_device_id *id, + struct pci_dev *from) +{ + struct device *dev; + struct device *dev_start = NULL; + struct pci_dev *pdev = NULL; + + if (from) + dev_start = &from->dev; + dev = bus_find_device_reverse(&pci_bus_type, dev_start, (void *)id, + match_pci_dev_by_id); + if (dev) + pdev = to_pci_dev(dev); + pci_dev_put(from); + return pdev; +} + +enum pci_search_direction { + PCI_SEARCH_FORWARD, + PCI_SEARCH_REVERSE, +}; + +static struct pci_dev *__pci_get_subsys(unsigned int vendor, unsigned int device, + unsigned int ss_vendor, unsigned int ss_device, + struct pci_dev *from, enum pci_search_direction dir) +{ + struct pci_device_id id = { + .vendor = vendor, + .device = device, + .subvendor = ss_vendor, + .subdevice = ss_device, + }; + + if (dir == PCI_SEARCH_FORWARD) + return pci_get_dev_by_id(&id, from); + else + return pci_get_dev_by_id_reverse(&id, from); +} + /** * pci_get_subsys - begin or continue searching for a PCI device by vendor/subvendor/device/subdevice id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids @@ -302,14 +341,8 @@ struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) { - struct pci_device_id id = { - .vendor = vendor, - .device = device, - .subvendor = ss_vendor, - .subdevice = ss_device, - }; - - return pci_get_dev_by_id(&id, from); + return __pci_get_subsys(vendor, device, ss_vendor, ss_device, from, + PCI_SEARCH_FORWARD); } EXPORT_SYMBOL(pci_get_subsys); @@ -334,6 +367,19 @@ struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, } EXPORT_SYMBOL(pci_get_device); +/* + * Same semantics as pci_get_device(), except walks the PCI device list + * in reverse discovery order. + */ +struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, + struct pci_dev *from) +{ + return __pci_get_subsys(vendor, device, PCI_ANY_ID, PCI_ANY_ID, from, + PCI_SEARCH_REVERSE); +} +EXPORT_SYMBOL(pci_get_device_reverse); + /** * pci_get_class - begin or continue searching for a PCI device by class * @class: search for a PCI device with this class designation diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index f5a56efd2bd6..99b1002b3e31 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -150,6 +150,9 @@ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn); struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match); +struct device *bus_find_device_reverse(const struct bus_type *bus, + struct device *start, const void *data, + device_match_t match); /** * bus_find_device_by_name - device iterator for locating a particular device * of a specific name. 
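For orientation, a minimal sketch of how a teardown path might use the reverse iterators added by this patch follows; the example_* names are hypothetical and the per-device cleanup is left as a stub.

#include <linux/pci.h>

/* Hypothetical per-device cleanup; mirrors whatever the setup walk did */
static int example_teardown_one(struct pci_dev *pdev, void *data)
{
	return 0;
}

/* Children are visited before their parent bridges */
static void example_teardown_subtree(struct pci_bus *bus)
{
	pci_walk_bus_reverse(bus, example_teardown_one, NULL);
}

/* Reverse discovery order across the whole system */
static void example_teardown_all(void)
{
	struct pci_dev *pdev = NULL;

	for_each_pci_dev_reverse(pdev)
		example_teardown_one(pdev, NULL);
}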
diff --git a/include/linux/pci.h b/include/linux/pci.h index 4402ca931124..b6a12a82be12 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -582,6 +582,8 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus); #define to_pci_dev(n) container_of(n, struct pci_dev, dev) #define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) +#define for_each_pci_dev_reverse(d) \ + while ((d = pci_get_device_reverse(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) static inline int pci_channel_offline(struct pci_dev *pdev) { @@ -1242,6 +1244,8 @@ u64 pci_get_dsn(struct pci_dev *dev); struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from); +struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device, + struct pci_dev *from); struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from); @@ -1661,6 +1665,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); +void pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), void *userdata); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); resource_size_t pcibios_window_alignment(struct pci_bus *bus, @@ -2049,6 +2055,11 @@ static inline struct pci_dev *pci_get_device(unsigned int vendor, struct pci_dev *from) { return NULL; } +static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, + struct pci_dev *from) +{ return NULL; } + static inline struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, -- cgit v1.2.3 From 3225f52cde56f46789a4972d3c54df8a4d75f022 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:56 -0700 Subject: PCI/TSM: Establish Secure Sessions and Link Encryption The PCIe 7.0 specification, section 11, defines the Trusted Execution Environment (TEE) Device Interface Security Protocol (TDISP). This protocol definition builds upon Component Measurement and Authentication (CMA), and link Integrity and Data Encryption (IDE). It adds support for assigning devices (PCI physical or virtual function) to a confidential VM such that the assigned device is enabled to access guest private memory protected by technologies like Intel TDX, AMD SEV-SNP, RISCV COVE, or ARM CCA. The "TSM" (TEE Security Manager) is a concept in the TDISP specification of an agent that mediates between a "DSM" (Device Security Manager) and system software in both a VMM and a confidential VM. A VMM uses TSM ABIs to setup link security and assign devices. A confidential VM uses TSM ABIs to transition an assigned device into the TDISP "RUN" state and validate its configuration. From a Linux perspective the TSM abstracts many of the details of TDISP, IDE, and CMA. Some of those details leak through at times, but for the most part TDISP is an internal implementation detail of the TSM. CONFIG_PCI_TSM adds an "authenticated" attribute and "tsm/" subdirectory to pci-sysfs. Consider that the TSM driver may itself be a PCI driver. Userspace can watch for the arrival of a "TSM" device, /sys/class/tsm/tsm0/uevent KOBJ_CHANGE, to know when the PCI core has initialized TSM services. 
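As a rough orientation for how a platform TSM driver plugs into this interface, a minimal sketch follows. All example_* names are hypothetical, only the function-0 (DSM) probe path is shown, and the actual SPDM/IDE work inside connect() is elided; the "Link" operation set it populates is described in the next paragraph.

#include <linux/err.h>
#include <linux/pci.h>
#include <linux/pci-tsm.h>
#include <linux/slab.h>
#include <linux/tsm.h>

struct example_dsm {
	struct pci_tsm_pf0 pf0;		/* DSM (function 0) context */
};

static struct pci_tsm *example_probe(struct tsm_dev *tsm_dev,
				     struct pci_dev *pdev)
{
	struct example_dsm *ctx;

	/*
	 * A complete driver also wraps plain 'struct pci_tsm' contexts
	 * for dependent functions via pci_tsm_link_constructor(); only
	 * the Device Security Manager (function 0) case is shown.
	 */
	if (!is_pci_tsm_pf0(pdev))
		return NULL;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	if (pci_tsm_pf0_constructor(pdev, &ctx->pf0, tsm_dev)) {
		kfree(ctx);
		return NULL;
	}

	return &ctx->pf0.base_tsm;
}

static void example_remove(struct pci_tsm *tsm)
{
	struct example_dsm *ctx =
		container_of(tsm, struct example_dsm, pf0.base_tsm);

	pci_tsm_pf0_destructor(&ctx->pf0);
	kfree(ctx);
}

static int example_connect(struct pci_dev *pdev)
{
	/* CMA-SPDM session setup and IDE establishment go here */
	return 0;
}

static void example_disconnect(struct pci_dev *pdev)
{
	/* tear down IDE and the SPDM session */
}

static struct pci_tsm_ops example_pci_ops = {
	.probe		= example_probe,
	.remove		= example_remove,
	.connect	= example_connect,
	.disconnect	= example_disconnect,
};

/* In the platform TSM driver's own probe path: */
static int example_platform_probe(struct device *parent)
{
	/* tsm_register() emits the KOBJ_CHANGE uevent userspace waits for */
	struct tsm_dev *tsm_dev = tsm_register(parent, &example_pci_ops);

	return PTR_ERR_OR_ZERO(tsm_dev);
}

Once registration succeeds, writing the TSM device name (for example "tsm0") to an endpoint's tsm/connect attribute drives the connect() callback above.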
The operations that can be executed against a PCI device are split into two mutually exclusive operation sets, "Link" and "Security" (struct pci_tsm_{link,security}_ops). The "Link" operations manage physical link security properties and communication with the device's Device Security Manager firmware. These are the host side operations in TDISP. The "Security" operations coordinate the security state of the assigned virtual device (TDI). These are the guest side operations in TDISP. Only "link" (Secure Session and physical Link Encryption) operations are defined at this stage. There are placeholders for the device security (Trusted Computing Base entry / exit) operations. The locking allows for multiple devices to be executing commands simultaneously, one outstanding command per-device and an rwsem synchronizes the implementation relative to TSM registration/unregistration events. Thanks to Wu Hao for his work on an early draft of this support. Cc: Lukas Wunner Cc: Samuel Ortiz Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Link: https://patch.msgid.link/20251031212902.2256310-5-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-pci | 51 +++ Documentation/driver-api/pci/index.rst | 1 + Documentation/driver-api/pci/tsm.rst | 21 ++ MAINTAINERS | 4 +- drivers/pci/Kconfig | 15 + drivers/pci/Makefile | 1 + drivers/pci/doe.c | 2 - drivers/pci/pci-sysfs.c | 4 + drivers/pci/pci.h | 10 + drivers/pci/probe.c | 3 + drivers/pci/remove.c | 6 + drivers/pci/tsm.c | 643 ++++++++++++++++++++++++++++++++ drivers/virt/coco/tsm-core.c | 46 ++- include/linux/pci-doe.h | 4 + include/linux/pci-tsm.h | 157 ++++++++ include/linux/pci.h | 3 + include/linux/tsm.h | 5 +- include/uapi/linux/pci_regs.h | 1 + 18 files changed, 971 insertions(+), 6 deletions(-) create mode 100644 Documentation/driver-api/pci/tsm.rst create mode 100644 drivers/pci/tsm.c create mode 100644 include/linux/pci-tsm.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 92debe879ffb..6ffe02f854d6 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -621,3 +621,54 @@ Description: number extended capability. The file is read only and due to the possible sensitivity of accessible serial numbers, admin only. + +What: /sys/bus/pci/devices/.../tsm/ +Contact: linux-coco@lists.linux.dev +Description: + This directory only appears if a physical device function + supports authentication (PCIe CMA-SPDM), interface security + (PCIe TDISP), and is accepted for secure operation by the + platform TSM driver. This attribute directory appears + dynamically after the platform TSM driver loads. So, only after + the /sys/class/tsm/tsm0 device arrives can tools assume that + devices without a tsm/ attribute directory will never have one; + before that, the security capabilities of the device relative to + the platform TSM are unknown. See + Documentation/ABI/testing/sysfs-class-tsm. + +What: /sys/bus/pci/devices/.../tsm/connect +Contact: linux-coco@lists.linux.dev +Description: + (RW) Write the name of a TSM (TEE Security Manager) device from + /sys/class/tsm to this file to establish a connection with the + device. 
This typically includes an SPDM (DMTF Security + Protocols and Data Models) session over PCIe DOE (Data Object + Exchange) and may also include PCIe IDE (Integrity and Data + Encryption) establishment. Reads from this attribute return the + name of the connected TSM or the empty string if not + connected. A TSM device signals its readiness to accept PCI + connection via a KOBJ_CHANGE event. + +What: /sys/bus/pci/devices/.../tsm/disconnect +Contact: linux-coco@lists.linux.dev +Description: + (WO) Write the name of the TSM device that was specified + to 'connect' to teardown the connection. + +What: /sys/bus/pci/devices/.../authenticated +Contact: linux-pci@vger.kernel.org +Description: + When the device's tsm/ directory is present device + authentication (PCIe CMA-SPDM) and link encryption (PCIe IDE) + are handled by the platform TSM (TEE Security Manager). When the + tsm/ directory is not present this attribute reflects only the + native CMA-SPDM authentication state with the kernel's + certificate store. + + If the attribute is not present, it indicates that + authentication is unsupported by the device, or the TSM has no + available authentication methods for the device. + + When present and the tsm/ attribute directory is present, the + authenticated attribute is an alias for the device 'connect' + state. See the 'tsm/connect' attribute for more details. diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst index a38e475cdbe3..9e1b801d0f74 100644 --- a/Documentation/driver-api/pci/index.rst +++ b/Documentation/driver-api/pci/index.rst @@ -10,6 +10,7 @@ The Linux PCI driver implementer's API guide pci p2pdma + tsm .. only:: subproject and html diff --git a/Documentation/driver-api/pci/tsm.rst b/Documentation/driver-api/pci/tsm.rst new file mode 100644 index 000000000000..232b92bec93f --- /dev/null +++ b/Documentation/driver-api/pci/tsm.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +======================================================== +PCI Trusted Execution Environment Security Manager (TSM) +======================================================== + +Subsystem Interfaces +==================== + +.. kernel-doc:: include/linux/pci-ide.h + :internal: + +.. kernel-doc:: drivers/pci/ide.c + :export: + +.. kernel-doc:: include/linux/pci-tsm.h + :internal: + +.. kernel-doc:: drivers/pci/tsm.c + :export: diff --git a/MAINTAINERS b/MAINTAINERS index b8c9929532ed..f1c8793bf03e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26118,8 +26118,10 @@ L: linux-coco@lists.linux.dev S: Maintained F: Documentation/ABI/testing/configfs-tsm-report F: Documentation/driver-api/coco/ +F: Documentation/driver-api/pci/tsm.rst +F: drivers/pci/tsm.c F: drivers/virt/coco/guest/ -F: include/linux/tsm*.h +F: include/linux/*tsm*.h F: samples/tsm-mr/ TRUSTED SERVICES TEE DRIVER diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index b28423e2057f..00b0210e1f1d 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -125,6 +125,21 @@ config PCI_ATS config PCI_IDE bool +config PCI_TSM + bool "PCI TSM: Device security protocol support" + select PCI_IDE + select PCI_DOE + select TSM + help + The TEE (Trusted Execution Environment) Device Interface + Security Protocol (TDISP) defines a "TSM" as a platform agent + that manages device authentication, link encryption, link + integrity protection, and assignment of PCI device functions + (virtual or physical) to confidential computing VMs that can + access (DMA) guest private memory. 
+ + Enable a platform TSM driver to use this capability. + config PCI_DOE bool "Enable PCI Data Object Exchange (DOE) support" help diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 6612256fd37d..2c545f877062 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o obj-$(CONFIG_VGA_ARB) += vgaarb.o obj-$(CONFIG_PCI_DOE) += doe.o obj-$(CONFIG_PCI_IDE) += ide.o +obj-$(CONFIG_PCI_TSM) += tsm.o obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c index aae9a8a00406..62be9c8dbc52 100644 --- a/drivers/pci/doe.c +++ b/drivers/pci/doe.c @@ -24,8 +24,6 @@ #include "pci.h" -#define PCI_DOE_FEATURE_DISCOVERY 0 - /* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */ #define PCI_DOE_TIMEOUT HZ #define PCI_DOE_POLL_INTERVAL (PCI_DOE_TIMEOUT / 128) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..7f9237a926c2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1868,6 +1868,10 @@ const struct attribute_group *pci_dev_attr_groups[] = { #endif #ifdef CONFIG_PCI_DOE &pci_doe_sysfs_group, +#endif +#ifdef CONFIG_PCI_TSM + &pci_tsm_auth_attr_group, + &pci_tsm_attr_group, #endif NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 86ef13e7cece..6e4cc1c9aa58 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -619,6 +619,16 @@ void pci_ide_init(struct pci_dev *dev); static inline void pci_ide_init(struct pci_dev *dev) { } #endif +#ifdef CONFIG_PCI_TSM +void pci_tsm_init(struct pci_dev *pdev); +void pci_tsm_destroy(struct pci_dev *pdev); +extern const struct attribute_group pci_tsm_attr_group; +extern const struct attribute_group pci_tsm_auth_attr_group; +#else +static inline void pci_tsm_init(struct pci_dev *pdev) { } +static inline void pci_tsm_destroy(struct pci_dev *pdev) { } +#endif + /** * pci_dev_set_io_state - Set the new error state if possible. * diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4c55020f3ddf..d1467348c169 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2763,6 +2763,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) ret = device_add(&dev->dev); WARN_ON(ret < 0); + /* Establish pdev->tsm for newly added (e.g. new SR-IOV VFs) */ + pci_tsm_init(dev); + pci_npem_create(dev); pci_doe_sysfs_init(dev); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index ce5c25adef55..803391892c4a 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -57,6 +57,12 @@ static void pci_destroy_dev(struct pci_dev *dev) pci_doe_sysfs_teardown(dev); pci_npem_remove(dev); + /* + * While device is in D0 drop the device from TSM link operations + * including unbind and disconnect (IDE + SPDM teardown). + */ + pci_tsm_destroy(dev); + device_del(&dev->dev); down_write(&pci_bus_sem); diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c new file mode 100644 index 000000000000..6a2849f77adc --- /dev/null +++ b/drivers/pci/tsm.c @@ -0,0 +1,643 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Interface with platform TEE Security Manager (TSM) objects as defined by + * PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP) + * + * Copyright(c) 2024-2025 Intel Corporation. All rights reserved. 
+ */ + +#define dev_fmt(fmt) "PCI/TSM: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "pci.h" + +/* + * Provide a read/write lock against the init / exit of pdev tsm + * capabilities and arrival/departure of a TSM instance + */ +static DECLARE_RWSEM(pci_tsm_rwsem); + +/* + * Count of TSMs registered that support physical link operations vs device + * security state management. + */ +static int pci_tsm_link_count; +static int pci_tsm_devsec_count; + +static const struct pci_tsm_ops *to_pci_tsm_ops(struct pci_tsm *tsm) +{ + return tsm->tsm_dev->pci_ops; +} + +static inline bool is_dsm(struct pci_dev *pdev) +{ + return pdev->tsm && pdev->tsm->dsm_dev == pdev; +} + +static inline bool has_tee(struct pci_dev *pdev) +{ + return pdev->devcap & PCI_EXP_DEVCAP_TEE; +} + +/* 'struct pci_tsm_pf0' wraps 'struct pci_tsm' when ->dsm_dev == ->pdev (self) */ +static struct pci_tsm_pf0 *to_pci_tsm_pf0(struct pci_tsm *tsm) +{ + /* + * All "link" TSM contexts reference the device that hosts the DSM + * interface for a set of devices. Walk to the DSM device and cast its + * ->tsm context to a 'struct pci_tsm_pf0 *'. + */ + struct pci_dev *pf0 = tsm->dsm_dev; + + if (!is_pci_tsm_pf0(pf0) || !is_dsm(pf0)) { + pci_WARN_ONCE(tsm->pdev, 1, "invalid context object\n"); + return NULL; + } + + return container_of(pf0->tsm, struct pci_tsm_pf0, base_tsm); +} + +static void tsm_remove(struct pci_tsm *tsm) +{ + struct pci_dev *pdev; + + if (!tsm) + return; + + pdev = tsm->pdev; + to_pci_tsm_ops(tsm)->remove(tsm); + pdev->tsm = NULL; +} +DEFINE_FREE(tsm_remove, struct pci_tsm *, if (_T) tsm_remove(_T)) + +static void pci_tsm_walk_fns(struct pci_dev *pdev, + int (*cb)(struct pci_dev *pdev, void *data), + void *data) +{ + /* Walk subordinate physical functions */ + for (int i = 0; i < 8; i++) { + struct pci_dev *pf __free(pci_dev_put) = pci_get_slot( + pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i)); + + if (!pf) + continue; + + /* on entry function 0 has already run @cb */ + if (i > 0) + cb(pf, data); + + /* walk virtual functions of each pf */ + for (int j = 0; j < pci_num_vf(pf); j++) { + struct pci_dev *vf __free(pci_dev_put) = + pci_get_domain_bus_and_slot( + pci_domain_nr(pf->bus), + pci_iov_virtfn_bus(pf, j), + pci_iov_virtfn_devfn(pf, j)); + + if (!vf) + continue; + + cb(vf, data); + } + } + + /* + * Walk downstream devices, assumes that an upstream DSM is + * limited to downstream physical functions + */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev)) + pci_walk_bus(pdev->subordinate, cb, data); +} + +static void pci_tsm_walk_fns_reverse(struct pci_dev *pdev, + int (*cb)(struct pci_dev *pdev, + void *data), + void *data) +{ + /* Reverse walk downstream devices */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev)) + pci_walk_bus_reverse(pdev->subordinate, cb, data); + + /* Reverse walk subordinate physical functions */ + for (int i = 7; i >= 0; i--) { + struct pci_dev *pf __free(pci_dev_put) = pci_get_slot( + pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i)); + + if (!pf) + continue; + + /* reverse walk virtual functions */ + for (int j = pci_num_vf(pf) - 1; j >= 0; j--) { + struct pci_dev *vf __free(pci_dev_put) = + pci_get_domain_bus_and_slot( + pci_domain_nr(pf->bus), + pci_iov_virtfn_bus(pf, j), + pci_iov_virtfn_devfn(pf, j)); + + if (!vf) + continue; + cb(vf, data); + } + + /* on exit, caller will run @cb on function 0 */ + if (i > 0) + cb(pf, data); + } +} + +static int probe_fn(struct pci_dev *pdev, void *dsm) +{ + struct pci_dev 
*dsm_dev = dsm; + const struct pci_tsm_ops *ops = to_pci_tsm_ops(dsm_dev->tsm); + + pdev->tsm = ops->probe(dsm_dev->tsm->tsm_dev, pdev); + pci_dbg(pdev, "setup TSM context: DSM: %s status: %s\n", + pci_name(dsm_dev), pdev->tsm ? "success" : "failed"); + return 0; +} + +static int pci_tsm_connect(struct pci_dev *pdev, struct tsm_dev *tsm_dev) +{ + int rc; + struct pci_tsm_pf0 *tsm_pf0; + const struct pci_tsm_ops *ops = tsm_dev->pci_ops; + struct pci_tsm *pci_tsm __free(tsm_remove) = ops->probe(tsm_dev, pdev); + + /* connect() mutually exclusive with subfunction pci_tsm_init() */ + lockdep_assert_held_write(&pci_tsm_rwsem); + + if (!pci_tsm) + return -ENXIO; + + pdev->tsm = pci_tsm; + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + + /* mutex_intr assumes connect() is always sysfs/user driven */ + ACQUIRE(mutex_intr, lock)(&tsm_pf0->lock); + if ((rc = ACQUIRE_ERR(mutex_intr, &lock))) + return rc; + + rc = ops->connect(pdev); + if (rc) + return rc; + + pdev->tsm = no_free_ptr(pci_tsm); + + /* + * Now that the DSM is established, probe() all the potential + * dependent functions. Failure to probe a function is not fatal + * to connect(), it just disables subsequent security operations + * for that function. + * + * Note this is done unconditionally, without regard to finding + * PCI_EXP_DEVCAP_TEE on the dependent function, for robustness. The DSM + * is the ultimate arbiter of security state relative to a given + * interface id, and if it says it can manage TDISP state of a function, + * let it. + */ + if (has_tee(pdev)) + pci_tsm_walk_fns(pdev, probe_fn, pdev); + return 0; +} + +static ssize_t connect_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct tsm_dev *tsm_dev; + int rc; + + ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &lock))) + return rc; + + if (!pdev->tsm) + return sysfs_emit(buf, "\n"); + + tsm_dev = pdev->tsm->tsm_dev; + return sysfs_emit(buf, "%s\n", dev_name(&tsm_dev->dev)); +} + +/* Is @tsm_dev managing physical link / session properties... */ +static bool is_link_tsm(struct tsm_dev *tsm_dev) +{ + return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->link_ops.probe; +} + +/* ...or is @tsm_dev managing device security state ? 
*/ +static bool is_devsec_tsm(struct tsm_dev *tsm_dev) +{ + return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->devsec_ops.lock; +} + +static ssize_t connect_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc, id; + + rc = sscanf(buf, "tsm%d\n", &id); + if (rc != 1) + return -EINVAL; + + ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock))) + return rc; + + if (pdev->tsm) + return -EBUSY; + + struct tsm_dev *tsm_dev __free(put_tsm_dev) = find_tsm_dev(id); + if (!is_link_tsm(tsm_dev)) + return -ENXIO; + + rc = pci_tsm_connect(pdev, tsm_dev); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(connect); + +static int remove_fn(struct pci_dev *pdev, void *data) +{ + tsm_remove(pdev->tsm); + return 0; +} + +static void __pci_tsm_disconnect(struct pci_dev *pdev) +{ + struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + const struct pci_tsm_ops *ops = to_pci_tsm_ops(pdev->tsm); + + /* disconnect() mutually exclusive with subfunction pci_tsm_init() */ + lockdep_assert_held_write(&pci_tsm_rwsem); + + /* + * disconnect() is uninterruptible as it may be called for device + * teardown + */ + guard(mutex)(&tsm_pf0->lock); + pci_tsm_walk_fns_reverse(pdev, remove_fn, NULL); + ops->disconnect(pdev); +} + +static void pci_tsm_disconnect(struct pci_dev *pdev) +{ + __pci_tsm_disconnect(pdev); + tsm_remove(pdev->tsm); +} + +static ssize_t disconnect_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct tsm_dev *tsm_dev; + int rc; + + ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock))) + return rc; + + if (!pdev->tsm) + return -ENXIO; + + tsm_dev = pdev->tsm->tsm_dev; + if (!sysfs_streq(buf, dev_name(&tsm_dev->dev))) + return -EINVAL; + + pci_tsm_disconnect(pdev); + return len; +} +static DEVICE_ATTR_WO(disconnect); + +/* The 'authenticated' attribute is exclusive to the presence of a 'link' TSM */ +static bool pci_tsm_link_group_visible(struct kobject *kobj) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + return pci_tsm_link_count && is_pci_tsm_pf0(pdev); +} +DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(pci_tsm_link); + +/* + * 'link' and 'devsec' TSMs share the same 'tsm/' sysfs group, so the TSM type + * specific attributes need individual visibility checks. + */ +static umode_t pci_tsm_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + if (pci_tsm_link_group_visible(kobj)) { + if (attr == &dev_attr_connect.attr || + attr == &dev_attr_disconnect.attr) + return attr->mode; + } + + return 0; +} + +static bool pci_tsm_group_visible(struct kobject *kobj) +{ + return pci_tsm_link_group_visible(kobj); +} +DEFINE_SYSFS_GROUP_VISIBLE(pci_tsm); + +static struct attribute *pci_tsm_attrs[] = { + &dev_attr_connect.attr, + &dev_attr_disconnect.attr, + NULL +}; + +const struct attribute_group pci_tsm_attr_group = { + .name = "tsm", + .attrs = pci_tsm_attrs, + .is_visible = SYSFS_GROUP_VISIBLE(pci_tsm), +}; + +static ssize_t authenticated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + /* + * When the SPDM session established via TSM the 'authenticated' state + * of the device is identical to the connect state. 
+ */ + return connect_show(dev, attr, buf); +} +static DEVICE_ATTR_RO(authenticated); + +static struct attribute *pci_tsm_auth_attrs[] = { + &dev_attr_authenticated.attr, + NULL +}; + +const struct attribute_group pci_tsm_auth_attr_group = { + .attrs = pci_tsm_auth_attrs, + .is_visible = SYSFS_GROUP_VISIBLE(pci_tsm_link), +}; + +/* + * Retrieve physical function0 device whether it has TEE capability or not + */ +static struct pci_dev *pf0_dev_get(struct pci_dev *pdev) +{ + struct pci_dev *pf_dev = pci_physfn(pdev); + + if (PCI_FUNC(pf_dev->devfn) == 0) + return pci_dev_get(pf_dev); + + return pci_get_slot(pf_dev->bus, + pf_dev->devfn - PCI_FUNC(pf_dev->devfn)); +} + +/* + * Find the PCI Device instance that serves as the Device Security Manager (DSM) + * for @pdev. Note that no additional reference is held for the resulting device + * because that resulting object always has a registered lifetime + * greater-than-or-equal to that of the @pdev argument. This is by virtue of + * @pdev being a descendant of, or identical to, the returned DSM device. + */ +static struct pci_dev *find_dsm_dev(struct pci_dev *pdev) +{ + struct device *grandparent; + struct pci_dev *uport; + + if (is_pci_tsm_pf0(pdev)) + return pdev; + + struct pci_dev *pf0 __free(pci_dev_put) = pf0_dev_get(pdev); + if (!pf0) + return NULL; + + if (is_dsm(pf0)) + return pf0; + + /* + * For cases where a switch may be hosting TDISP services on behalf of + * downstream devices, check the first upstream port relative to this + * endpoint. + */ + if (!pdev->dev.parent) + return NULL; + grandparent = pdev->dev.parent->parent; + if (!grandparent) + return NULL; + if (!dev_is_pci(grandparent)) + return NULL; + uport = to_pci_dev(grandparent); + if (!pci_is_pcie(uport) || + pci_pcie_type(uport) != PCI_EXP_TYPE_UPSTREAM) + return NULL; + + if (is_dsm(uport)) + return uport; + return NULL; +} + +/** + * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs + * @pdev: The PCI device + * @tsm: context to initialize + * @tsm_dev: Platform TEE Security Manager, initiator of security operations + */ +int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm, + struct tsm_dev *tsm_dev) +{ + if (!is_link_tsm(tsm_dev)) + return -EINVAL; + + tsm->dsm_dev = find_dsm_dev(pdev); + if (!tsm->dsm_dev) { + pci_warn(pdev, "failed to find Device Security Manager\n"); + return -ENXIO; + } + tsm->pdev = pdev; + tsm->tsm_dev = tsm_dev; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_tsm_link_constructor); + +/** + * pci_tsm_pf0_constructor() - common 'struct pci_tsm_pf0' (DSM) initialization + * @pdev: Physical Function 0 PCI device (as indicated by is_pci_tsm_pf0()) + * @tsm: context to initialize + * @tsm_dev: Platform TEE Security Manager, initiator of security operations + */ +int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, + struct tsm_dev *tsm_dev) +{ + mutex_init(&tsm->lock); + tsm->doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_PCI_SIG, + PCI_DOE_FEATURE_CMA); + if (!tsm->doe_mb) { + pci_warn(pdev, "TSM init failure, no CMA mailbox\n"); + return -ENODEV; + } + + return pci_tsm_link_constructor(pdev, &tsm->base_tsm, tsm_dev); +} +EXPORT_SYMBOL_GPL(pci_tsm_pf0_constructor); + +void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *pf0_tsm) +{ + mutex_destroy(&pf0_tsm->lock); +} +EXPORT_SYMBOL_GPL(pci_tsm_pf0_destructor); + +static void pf0_sysfs_enable(struct pci_dev *pdev) +{ + bool tee = has_tee(pdev); + + pci_dbg(pdev, "Device Security Manager detected (%s%s%s)\n", + pdev->ide_cap ? 
"IDE" : "", pdev->ide_cap && tee ? " " : "", + tee ? "TEE" : ""); + + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group); + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group); +} + +int pci_tsm_register(struct tsm_dev *tsm_dev) +{ + struct pci_dev *pdev = NULL; + + if (!tsm_dev) + return -EINVAL; + + /* The TSM device must only implement one of link_ops or devsec_ops */ + if (!is_link_tsm(tsm_dev) && !is_devsec_tsm(tsm_dev)) + return -EINVAL; + + if (is_link_tsm(tsm_dev) && is_devsec_tsm(tsm_dev)) + return -EINVAL; + + guard(rwsem_write)(&pci_tsm_rwsem); + + /* On first enable, update sysfs groups */ + if (is_link_tsm(tsm_dev) && pci_tsm_link_count++ == 0) { + for_each_pci_dev(pdev) + if (is_pci_tsm_pf0(pdev)) + pf0_sysfs_enable(pdev); + } else if (is_devsec_tsm(tsm_dev)) { + pci_tsm_devsec_count++; + } + + return 0; +} + +static void pci_tsm_fn_exit(struct pci_dev *pdev) +{ + /* TODO: unbind the fn */ + tsm_remove(pdev->tsm); +} + +/** + * __pci_tsm_destroy() - destroy the TSM context for @pdev + * @pdev: device to cleanup + * @tsm_dev: the TSM device being removed, or NULL if @pdev is being removed. + * + * At device removal or TSM unregistration all established context + * with the TSM is torn down. Additionally, if there are no more TSMs + * registered, the PCI tsm/ sysfs attributes are hidden. + */ +static void __pci_tsm_destroy(struct pci_dev *pdev, struct tsm_dev *tsm_dev) +{ + struct pci_tsm *tsm = pdev->tsm; + + lockdep_assert_held_write(&pci_tsm_rwsem); + + /* + * First, handle the TSM removal case to shutdown @pdev sysfs, this is + * skipped if the device itself is being removed since sysfs goes away + * naturally at that point + */ + if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev) && !pci_tsm_link_count) { + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group); + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group); + } + + /* Nothing else to do if this device never attached to the departing TSM */ + if (!tsm) + return; + + /* Now lookup the tsm_dev to destroy TSM context */ + if (!tsm_dev) + tsm_dev = tsm->tsm_dev; + else if (tsm_dev != tsm->tsm_dev) + return; + + if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev)) + pci_tsm_disconnect(pdev); + else + pci_tsm_fn_exit(pdev); +} + +void pci_tsm_destroy(struct pci_dev *pdev) +{ + guard(rwsem_write)(&pci_tsm_rwsem); + __pci_tsm_destroy(pdev, NULL); +} + +void pci_tsm_init(struct pci_dev *pdev) +{ + guard(rwsem_read)(&pci_tsm_rwsem); + + /* + * Subfunctions are either probed synchronous with connect() or later + * when either the SR-IOV configuration is changed, or, unlikely, + * connect() raced initial bus scanning. + */ + if (pdev->tsm) + return; + + if (pci_tsm_link_count) { + struct pci_dev *dsm = find_dsm_dev(pdev); + + if (!dsm) + return; + + /* + * The only path to init a Device Security Manager capable + * device is via connect(). 
+ */ + if (!dsm->tsm) + return; + + probe_fn(pdev, dsm); + } +} + +void pci_tsm_unregister(struct tsm_dev *tsm_dev) +{ + struct pci_dev *pdev = NULL; + + guard(rwsem_write)(&pci_tsm_rwsem); + if (is_link_tsm(tsm_dev)) + pci_tsm_link_count--; + if (is_devsec_tsm(tsm_dev)) + pci_tsm_devsec_count--; + for_each_pci_dev_reverse(pdev) + __pci_tsm_destroy(pdev, tsm_dev); +} + +int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, + size_t req_sz, void *resp, size_t resp_sz) +{ + struct pci_tsm_pf0 *tsm; + + if (!pdev->tsm || !is_pci_tsm_pf0(pdev)) + return -ENXIO; + + tsm = to_pci_tsm_pf0(pdev->tsm); + if (!tsm->doe_mb) + return -ENXIO; + + return pci_doe(tsm->doe_mb, PCI_VENDOR_ID_PCI_SIG, type, req, req_sz, + resp, resp_sz); +} +EXPORT_SYMBOL_GPL(pci_tsm_doe_transfer); diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c index 347507cc5e3f..0e705f3067a1 100644 --- a/drivers/virt/coco/tsm-core.c +++ b/drivers/virt/coco/tsm-core.c @@ -8,11 +8,29 @@ #include #include #include +#include static struct class *tsm_class; static DECLARE_RWSEM(tsm_rwsem); static DEFINE_IDA(tsm_ida); +static int match_id(struct device *dev, const void *data) +{ + struct tsm_dev *tsm_dev = container_of(dev, struct tsm_dev, dev); + int id = *(const int *)data; + + return tsm_dev->id == id; +} + +struct tsm_dev *find_tsm_dev(int id) +{ + struct device *dev = class_find_device(tsm_class, NULL, &id, match_id); + + if (!dev) + return NULL; + return container_of(dev, struct tsm_dev, dev); +} + static struct tsm_dev *alloc_tsm_dev(struct device *parent) { struct device *dev; @@ -36,7 +54,29 @@ static struct tsm_dev *alloc_tsm_dev(struct device *parent) return no_free_ptr(tsm_dev); } -struct tsm_dev *tsm_register(struct device *parent) +static struct tsm_dev *tsm_register_pci_or_reset(struct tsm_dev *tsm_dev, + struct pci_tsm_ops *pci_ops) +{ + int rc; + + if (!pci_ops) + return tsm_dev; + + tsm_dev->pci_ops = pci_ops; + rc = pci_tsm_register(tsm_dev); + if (rc) { + dev_err(tsm_dev->dev.parent, + "PCI/TSM registration failure: %d\n", rc); + device_unregister(&tsm_dev->dev); + return ERR_PTR(rc); + } + + /* Notify TSM userspace that PCI/TSM operations are now possible */ + kobject_uevent(&tsm_dev->dev.kobj, KOBJ_CHANGE); + return tsm_dev; +} + +struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *pci_ops) { struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent); struct device *dev; @@ -54,12 +94,14 @@ struct tsm_dev *tsm_register(struct device *parent) if (rc) return ERR_PTR(rc); - return no_free_ptr(tsm_dev); + return tsm_register_pci_or_reset(no_free_ptr(tsm_dev), pci_ops); } EXPORT_SYMBOL_GPL(tsm_register); void tsm_unregister(struct tsm_dev *tsm_dev) { + if (tsm_dev->pci_ops) + pci_tsm_unregister(tsm_dev); device_unregister(&tsm_dev->dev); } EXPORT_SYMBOL_GPL(tsm_unregister); diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h index 1f14aed4354b..bd4346a7c4e7 100644 --- a/include/linux/pci-doe.h +++ b/include/linux/pci-doe.h @@ -15,6 +15,10 @@ struct pci_doe_mb; +#define PCI_DOE_FEATURE_DISCOVERY 0 +#define PCI_DOE_FEATURE_CMA 1 +#define PCI_DOE_FEATURE_SSESSION 2 + struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor, u8 type); diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h new file mode 100644 index 000000000000..e921d30f9b6c --- /dev/null +++ b/include/linux/pci-tsm.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PCI_TSM_H +#define __PCI_TSM_H +#include +#include + +struct pci_tsm; 
+struct tsm_dev; + +/* + * struct pci_tsm_ops - manage confidential links and security state + * @link_ops: Coordinate PCIe SPDM and IDE establishment via a platform TSM. + * Provide a secure session transport for TDISP state management + * (typically bare metal physical function operations). + * @devsec_ops: Lock, unlock, and interrogate the security state of the + * function via the platform TSM (typically virtual function + * operations). + * + * This operations are mutually exclusive either a tsm_dev instance + * manages physical link properties or it manages function security + * states like TDISP lock/unlock. + */ +struct pci_tsm_ops { + /* + * struct pci_tsm_link_ops - Manage physical link and the TSM/DSM session + * @probe: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on link operations + * @remove: destroy link operations context + * @connect: establish / validate a secure connection (e.g. IDE) + * with the device + * @disconnect: teardown the secure link + * + * Context: @probe, @remove, @connect, and @disconnect run under + * pci_tsm_rwsem held for write to sync with TSM unregistration and + * mutual exclusion of @connect and @disconnect. @connect and + * @disconnect additionally run under the DSM lock (struct + * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. + */ + struct_group_tagged(pci_tsm_link_ops, link_ops, + struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*remove)(struct pci_tsm *tsm); + int (*connect)(struct pci_dev *pdev); + void (*disconnect)(struct pci_dev *pdev); + ); + + /* + * struct pci_tsm_devsec_ops - Manage the security state of the function + * @lock: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on security state transitions from the + * LOCKED state + * @unlock: destroy TSM context and return device to UNLOCKED state + * + * Context: @lock and @unlock run under pci_tsm_rwsem held for write to + * sync with TSM unregistration and each other + */ + struct_group_tagged(pci_tsm_devsec_ops, devsec_ops, + struct pci_tsm *(*lock)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*unlock)(struct pci_tsm *tsm); + ); +}; + +/** + * struct pci_tsm - Core TSM context for a given PCIe endpoint + * @pdev: Back ref to device function, distinguishes type of pci_tsm context + * @dsm_dev: PCI Device Security Manager for link operations on @pdev + * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device + * Function Security operations + * + * This structure is wrapped by low level TSM driver data and returned by + * probe()/lock(), it is freed by the corresponding remove()/unlock(). + * + * For link operations it serves to cache the association between a Device + * Security Manager (DSM) and the functions that manager can assign to a TVM. + * That can be "self", for assigning function0 of a TEE I/O device, a + * sub-function (SR-IOV virtual function, or non-function0 + * multifunction-device), or a downstream endpoint (PCIe upstream switch-port as + * DSM). 
+ */ +struct pci_tsm { + struct pci_dev *pdev; + struct pci_dev *dsm_dev; + struct tsm_dev *tsm_dev; +}; + +/** + * struct pci_tsm_pf0 - Physical Function 0 TDISP link context + * @base_tsm: generic core "tsm" context + * @lock: mutual exclustion for pci_tsm_ops invocation + * @doe_mb: PCIe Data Object Exchange mailbox + */ +struct pci_tsm_pf0 { + struct pci_tsm base_tsm; + struct mutex lock; + struct pci_doe_mb *doe_mb; +}; + +/* physical function0 and capable of 'connect' */ +static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) +{ + if (!pdev) + return false; + + if (!pci_is_pcie(pdev)) + return false; + + if (pdev->is_virtfn) + return false; + + /* + * Allow for a Device Security Manager (DSM) associated with function0 + * of an Endpoint to coordinate TDISP requests for other functions + * (physical or virtual) of the device, or allow for an Upstream Port + * DSM to accept TDISP requests for the Endpoints downstream of the + * switch. + */ + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_RC_END: + if (pdev->ide_cap || (pdev->devcap & PCI_EXP_DEVCAP_TEE)) + break; + fallthrough; + default: + return false; + } + + return PCI_FUNC(pdev->devfn) == 0; +} + +#ifdef CONFIG_PCI_TSM +int pci_tsm_register(struct tsm_dev *tsm_dev); +void pci_tsm_unregister(struct tsm_dev *tsm_dev); +int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm, + struct tsm_dev *tsm_dev); +int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, + struct tsm_dev *tsm_dev); +void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); +int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, + size_t req_sz, void *resp, size_t resp_sz); +#else +static inline int pci_tsm_register(struct tsm_dev *tsm_dev) +{ + return 0; +} +static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) +{ +} +static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, + const void *req, size_t req_sz, + void *resp, size_t resp_sz) +{ + return -ENXIO; +} +#endif +#endif /*__PCI_TSM_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index b6a12a82be12..2f9c0cb6a50a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -546,6 +546,9 @@ struct pci_dev { u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ +#endif +#ifdef CONFIG_PCI_TSM + struct pci_tsm *tsm; /* TSM operation state */ #endif u16 acs_cap; /* ACS Capability offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/linux/tsm.h b/include/linux/tsm.h index cd97c63ffa32..22e05b2aac69 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -108,9 +108,11 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct pci_tsm_ops; struct tsm_dev { struct device dev; int id; + const struct pci_tsm_ops *pci_ops; }; DEFINE_FREE(put_tsm_dev, struct tsm_dev *, @@ -118,6 +120,7 @@ DEFINE_FREE(put_tsm_dev, struct tsm_dev *, int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); -struct tsm_dev *tsm_register(struct device *parent); +struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); +struct tsm_dev *find_tsm_dev(int id); #endif /* __TSM_H */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 
05bd22d9e352..f2759c1097bc 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -503,6 +503,7 @@ #define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */ #define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */ #define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */ +#define PCI_EXP_DEVCAP_TEE 0x40000000 /* TEE I/O (TDISP) Support */ #define PCI_EXP_DEVCTL 0x08 /* Device Control */ #define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ #define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ -- cgit v1.2.3 From c0c1262fbfbafe943dbccd5f97b500b72dbd2205 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:57 -0700 Subject: PCI: Add PCIe Device 3 Extended Capability enumeration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0 Section 7.7.9 Device 3 Extended Capability Structure, defines the canonical location for determining the Flit Mode of a device. This status is a dependency for PCIe IDE enabling. Add a new fm_enabled flag to 'struct pci_dev'. Cc: Lukas Wunner Cc: Ilpo Järvinen Cc: Bjorn Helgaas Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/probe.c | 12 ++++++++++++ include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d1467348c169..3b54f1720be5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2283,6 +2283,17 @@ int pci_configure_extended_tags(struct pci_dev *dev, void *ign) return 0; } +static void pci_dev3_init(struct pci_dev *pdev) +{ + u16 cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DEV3); + u32 val = 0; + + if (!cap) + return; + pci_read_config_dword(pdev, cap + PCI_DEV3_STA, &val); + pdev->fm_enabled = !!(val & PCI_DEV3_STA_SEGMENT); +} + /** * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable * @dev: PCI device to query @@ -2667,6 +2678,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_doe_init(dev); /* Data Object Exchange */ pci_tph_init(dev); /* TLP Processing Hints */ pci_rebar_init(dev); /* Resizable BAR */ + pci_dev3_init(dev); /* Device 3 capabilities */ pci_ide_init(dev); /* Link Integrity and Data Encryption */ pcie_report_downtraining(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 2f9c0cb6a50a..ea94799c81b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -450,6 +450,7 @@ struct pci_dev { unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ unsigned int tph_enabled:1; /* TLP Processing Hints */ + unsigned int fm_enabled:1; /* Flit Mode (segment captured) */ unsigned int is_managed:1; /* Managed via devres */ unsigned int is_msi_managed:1; /* MSI release via devres installed */ unsigned int needs_freset:1; /* Requires fundamental reset */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f2759c1097bc..3add74ae2594 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -755,6 +755,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange 
*/ +#define PCI_EXT_CAP_ID_DEV3 0x2F /* Device 3 Capability/Control/Status */ #define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1246,6 +1247,12 @@ /* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */ #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE +/* Device 3 Extended Capability */ +#define PCI_DEV3_CAP 0x04 /* Device 3 Capabilities Register */ +#define PCI_DEV3_CTL 0x08 /* Device 3 Control Register */ +#define PCI_DEV3_STA 0x0c /* Device 3 Status Register */ +#define PCI_DEV3_STA_SEGMENT 0x8 /* Segment Captured (end-to-end flit-mode detected) */ + /* Compute Express Link (CXL r3.1, sec 8.1.5) */ #define PCI_DVSEC_CXL_PORT 3 #define PCI_DVSEC_CXL_PORT_CTL 0x0c -- cgit v1.2.3 From 1e4d2ff3ae450dab37b5b5726c3f7df3e60d6e89 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:59 -0700 Subject: PCI/IDE: Add IDE establishment helpers There are two components to establishing an encrypted link, provisioning the stream in Partner Port config-space, and programming the keys into the link layer via IDE_KM (IDE Key Management). This new library, drivers/pci/ide.c, enables the former. IDE_KM, via a TSM low-level driver, is saved for later. With the platform TSM implementations of SEV-TIO and TDX Connect in mind this library abstracts small differences in those implementations. For example, TDX Connect handles Root Port register setup while SEV-TIO expects System Software to update the Root Port registers. This is the rationale for fine-grained 'setup' + 'enable' verbs. The other design detail for TSM-coordinated IDE establishment is that the TSM may manage allocation of Stream IDs, this is why the Stream ID value is passed in to pci_ide_stream_setup(). The flow is: pci_ide_stream_alloc(): Allocate a Selective IDE Stream Register Block in each Partner Port (Endpoint + Root Port), and reserve a host bridge / platform stream slot. Gather Partner Port specific stream settings like Requester ID. pci_ide_stream_register(): Publish the stream in sysfs after allocating a Stream ID. In the TSM case the TSM allocates the Stream ID for the Partner Port pair. pci_ide_stream_setup(): Program the stream settings to a Partner Port. Caller is responsible for optionally calling this for the Root Port as well if the TSM implementation requires it. pci_ide_stream_enable(): Enable the stream after IDE_KM. In support of system administrators auditing where platform, Root Port, and Endpoint IDE stream resources are being spent, the allocated stream is reflected as a symlink from the host bridge to the endpoint with the name: stream%d.%d.%d Where the tuple of integers reflects the allocated platform, Root Port, and Endpoint stream index (Selective IDE Stream Register Block) values. Thanks to Wu Hao for a draft implementation of this infrastructure. 
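To make the sequencing above concrete, a minimal sketch of a connect() path driving these helpers follows. Only the pci_ide_stream_alloc()/pci_ide_stream_free() signatures appear verbatim in this excerpt; the register/setup/enable call forms below are assumed from the description above, the Stream ID is presumed to have been allocated by the TSM already, and IDE_KM plus error unwinding are elided.

#include <linux/pci.h>
#include <linux/pci-ide.h>

/*
 * Illustrative sketch only: call signatures for register/setup/enable
 * are assumed from the changelog above, and failure unwinding via
 * pci_ide_stream_free() is omitted for brevity.
 */
static int example_ide_establish(struct pci_dev *pdev, u8 stream_id)
{
	struct pci_dev *rp = pcie_find_root_port(pdev);
	struct pci_ide *ide;

	/* 1. Reserve Selective IDE register blocks and a host bridge slot */
	ide = pci_ide_stream_alloc(pdev);
	if (!ide)
		return -ENXIO;

	/* 2. TSM-owned Stream ID, then publish streamH.R.E in sysfs */
	ide->stream_id = stream_id;
	pci_ide_stream_register(ide);

	/*
	 * 3. Program the Partner Port registers; the Root Port side is
	 * only written when the TSM leaves that to system software
	 */
	pci_ide_stream_setup(pdev, ide);
	pci_ide_stream_setup(rp, ide);

	/* 4. ... IDE_KM key programming via the low-level TSM driver ... */

	/* 5. Enable the stream once keys are programmed in the link layer */
	pci_ide_stream_enable(pdev, ide);

	return 0;
}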
Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- .../ABI/testing/sysfs-devices-pci-host-bridge | 14 + drivers/pci/ide.c | 428 +++++++++++++++++++++ drivers/pci/pci.h | 2 + drivers/pci/probe.c | 1 + include/linux/pci-ide.h | 78 ++++ include/linux/pci.h | 6 + 6 files changed, 529 insertions(+) create mode 100644 include/linux/pci-ide.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge index 8c3a652799f1..2c66e5bb2bf8 100644 --- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge +++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge @@ -17,3 +17,17 @@ Description: PNP0A08 (/sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A08:00). See /sys/devices/pciDDDD:BB entry for details about the DDDD:BB format. + +What: pciDDDD:BB/streamH.R.E +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a platform has established a secure connection, PCIe + IDE, between two Partner Ports, this symlink appears. A stream + consumes a Stream ID slot in each of the Host bridge (H), Root + Port (R) and Endpoint (E). The link points to the Endpoint PCI + device in the Selective IDE Stream pairing. Specifically, "R" + and "E" represent the assigned Selective IDE Stream Register + Block in the Root Port and Endpoint, and "H" represents a + platform specific pool of stream resources shared by the Root + Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for + details about the DDDD:BB format. diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 26866edf91b4..7643840738fe 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -5,8 +5,12 @@ #define dev_fmt(fmt) "PCI/IDE: " fmt #include +#include #include +#include #include +#include +#include #include "pci.h" @@ -23,12 +27,25 @@ static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index, return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem); } +static int sel_ide_offset(struct pci_dev *pdev, + struct pci_ide_partner *settings) +{ + return __sel_ide_offset(pdev->ide_cap, pdev->nr_link_ide, + settings->stream_index, pdev->nr_ide_mem); +} + void pci_ide_init(struct pci_dev *pdev) { u16 nr_link_ide, nr_ide_mem, nr_streams; u16 ide_cap; u32 val; + /* + * Unconditionally init so that ida idle state is consistent with + * pdev->ide_cap. 
+ */ + ida_init(&pdev->ide_stream_ida); + if (!pci_is_pcie(pdev)) return; @@ -84,5 +101,416 @@ void pci_ide_init(struct pci_dev *pdev) pdev->ide_cap = ide_cap; pdev->nr_link_ide = nr_link_ide; + pdev->nr_sel_ide = nr_streams; pdev->nr_ide_mem = nr_ide_mem; } + +struct stream_index { + struct ida *ida; + u8 stream_index; +}; + +static void free_stream_index(struct stream_index *stream) +{ + ida_free(stream->ida, stream->stream_index); +} + +DEFINE_FREE(free_stream, struct stream_index *, if (_T) free_stream_index(_T)) +static struct stream_index *alloc_stream_index(struct ida *ida, u16 max, + struct stream_index *stream) +{ + int id; + + if (!max) + return NULL; + + id = ida_alloc_max(ida, max - 1, GFP_KERNEL); + if (id < 0) + return NULL; + + *stream = (struct stream_index) { + .ida = ida, + .stream_index = id, + }; + return stream; +} + +/** + * pci_ide_stream_alloc() - Reserve stream indices and probe for settings + * @pdev: IDE capable PCIe Endpoint Physical Function + * + * Retrieve the Requester ID range of @pdev for programming its Root + * Port IDE RID Association registers, and conversely retrieve the + * Requester ID of the Root Port for programming @pdev's IDE RID + * Association registers. + * + * Allocate a Selective IDE Stream Register Block instance per port. + * + * Allocate a platform stream resource from the associated host bridge. + * Retrieve stream association parameters for Requester ID range and + * address range restrictions for the stream. + */ +struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) +{ + /* EP, RP, + HB Stream allocation */ + struct stream_index __stream[PCI_IDE_HB + 1]; + struct pci_host_bridge *hb; + struct pci_dev *rp; + int num_vf, rid_end; + + if (!pci_is_pcie(pdev)) + return NULL; + + if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) + return NULL; + + if (!pdev->ide_cap) + return NULL; + + struct pci_ide *ide __free(kfree) = kzalloc(sizeof(*ide), GFP_KERNEL); + if (!ide) + return NULL; + + hb = pci_find_host_bridge(pdev->bus); + struct stream_index *hb_stream __free(free_stream) = alloc_stream_index( + &hb->ide_stream_ida, hb->nr_ide_streams, &__stream[PCI_IDE_HB]); + if (!hb_stream) + return NULL; + + rp = pcie_find_root_port(pdev); + struct stream_index *rp_stream __free(free_stream) = alloc_stream_index( + &rp->ide_stream_ida, rp->nr_sel_ide, &__stream[PCI_IDE_RP]); + if (!rp_stream) + return NULL; + + struct stream_index *ep_stream __free(free_stream) = alloc_stream_index( + &pdev->ide_stream_ida, pdev->nr_sel_ide, &__stream[PCI_IDE_EP]); + if (!ep_stream) + return NULL; + + /* for SR-IOV case, cover all VFs */ + num_vf = pci_num_vf(pdev); + if (num_vf) + rid_end = PCI_DEVID(pci_iov_virtfn_bus(pdev, num_vf), + pci_iov_virtfn_devfn(pdev, num_vf)); + else + rid_end = pci_dev_id(pdev); + + *ide = (struct pci_ide) { + .pdev = pdev, + .partner = { + [PCI_IDE_EP] = { + .rid_start = pci_dev_id(rp), + .rid_end = pci_dev_id(rp), + .stream_index = no_free_ptr(ep_stream)->stream_index, + }, + [PCI_IDE_RP] = { + .rid_start = pci_dev_id(pdev), + .rid_end = rid_end, + .stream_index = no_free_ptr(rp_stream)->stream_index, + }, + }, + .host_bridge_stream = no_free_ptr(hb_stream)->stream_index, + .stream_id = -1, + }; + + return_ptr(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_alloc); + +/** + * pci_ide_stream_free() - unwind pci_ide_stream_alloc() + * @ide: idle IDE settings descriptor + * + * Free all of the stream index (register block) allocations acquired by + * pci_ide_stream_alloc(). 
The stream represented by @ide is assumed to + * be unregistered and not instantiated in any device. + */ +void pci_ide_stream_free(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_dev *rp = pcie_find_root_port(pdev); + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + + ida_free(&pdev->ide_stream_ida, ide->partner[PCI_IDE_EP].stream_index); + ida_free(&rp->ide_stream_ida, ide->partner[PCI_IDE_RP].stream_index); + ida_free(&hb->ide_stream_ida, ide->host_bridge_stream); + kfree(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_free); + +/** + * pci_ide_stream_release() - unwind and release an @ide context + * @ide: partially or fully registered IDE settings descriptor + * + * In support of automatic cleanup of IDE setup routines perform IDE + * teardown in expected reverse order of setup and with respect to which + * aspects of IDE setup have successfully completed. + * + * Be careful that setup order mirrors this shutdown order. Otherwise, + * open code releasing the IDE context. + */ +void pci_ide_stream_release(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_dev *rp = pcie_find_root_port(pdev); + + if (ide->partner[PCI_IDE_RP].enable) + pci_ide_stream_disable(rp, ide); + + if (ide->partner[PCI_IDE_EP].enable) + pci_ide_stream_disable(pdev, ide); + + if (ide->partner[PCI_IDE_RP].setup) + pci_ide_stream_teardown(rp, ide); + + if (ide->partner[PCI_IDE_EP].setup) + pci_ide_stream_teardown(pdev, ide); + + if (ide->name) + pci_ide_stream_unregister(ide); + + pci_ide_stream_free(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_release); + +/** + * pci_ide_stream_register() - Prepare to activate an IDE Stream + * @ide: IDE settings descriptor + * + * After a Stream ID has been acquired for @ide, record the presence of + * the stream in sysfs. The expectation is that @ide is immutable while + * registered. + */ +int pci_ide_stream_register(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + u8 ep_stream, rp_stream; + int rc; + + if (ide->stream_id < 0 || ide->stream_id > U8_MAX) { + pci_err(pdev, "Setup fail: Invalid Stream ID: %d\n", ide->stream_id); + return -ENXIO; + } + + ep_stream = ide->partner[PCI_IDE_EP].stream_index; + rp_stream = ide->partner[PCI_IDE_RP].stream_index; + const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d", + ide->host_bridge_stream, + rp_stream, ep_stream); + if (!name) + return -ENOMEM; + + rc = sysfs_create_link(&hb->dev.kobj, &pdev->dev.kobj, name); + if (rc) + return rc; + + ide->name = no_free_ptr(name); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_register); + +/** + * pci_ide_stream_unregister() - unwind pci_ide_stream_register() + * @ide: idle IDE settings descriptor + * + * In preparation for freeing @ide, remove sysfs enumeration for the + * stream. 
+ */ +void pci_ide_stream_unregister(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + + sysfs_remove_link(&hb->dev.kobj, ide->name); + kfree(ide->name); + ide->name = NULL; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_unregister); + +static int pci_ide_domain(struct pci_dev *pdev) +{ + if (pdev->fm_enabled) + return pci_domain_nr(pdev->bus); + return 0; +} + +struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide) +{ + if (!pci_is_pcie(pdev)) { + pci_warn_once(pdev, "not a PCIe device\n"); + return NULL; + } + + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_ENDPOINT: + if (pdev != ide->pdev) { + pci_warn_once(pdev, "setup expected Endpoint: %s\n", pci_name(ide->pdev)); + return NULL; + } + return &ide->partner[PCI_IDE_EP]; + case PCI_EXP_TYPE_ROOT_PORT: { + struct pci_dev *rp = pcie_find_root_port(ide->pdev); + + if (pdev != rp) { + pci_warn_once(pdev, "setup expected Root Port: %s\n", + pci_name(rp)); + return NULL; + } + return &ide->partner[PCI_IDE_RP]; + } + default: + pci_warn_once(pdev, "invalid device type\n"); + return NULL; + } +} +EXPORT_SYMBOL_GPL(pci_ide_to_settings); + +static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, + struct pci_ide_partner *settings, int pos, + bool enable) +{ + u32 val = FIELD_PREP(PCI_IDE_SEL_CTL_ID, ide->stream_id) | + FIELD_PREP(PCI_IDE_SEL_CTL_DEFAULT, settings->default_stream) | + FIELD_PREP(PCI_IDE_SEL_CTL_CFG_EN, pdev->ide_cfg) | + FIELD_PREP(PCI_IDE_SEL_CTL_TEE_LIMITED, pdev->ide_tee_limit) | + FIELD_PREP(PCI_IDE_SEL_CTL_EN, enable); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); +} + +/** + * pci_ide_stream_setup() - program settings to Selective IDE Stream registers + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * + * When @pdev is a PCI_EXP_TYPE_ENDPOINT then the PCI_IDE_EP partner + * settings are written to @pdev's Selective IDE Stream register block, + * and when @pdev is a PCI_EXP_TYPE_ROOT_PORT, the PCI_IDE_RP settings + * are selected. + */ +void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + u32 val; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val); + + val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | + FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | + FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val); + + /* + * Setup control register early for devices that expect + * stream_id is set during key programming. + */ + set_ide_sel_ctl(pdev, ide, settings, pos, false); + settings->setup = 1; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_setup); + +/** + * pci_ide_stream_teardown() - disable the stream and clear all settings + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * + * For stream destruction, zero all registers that may have been written + * by pci_ide_stream_setup(). Consider pci_ide_stream_disable() to leave + * settings in place while temporarily disabling the stream. 
+ */ +void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0); + settings->setup = 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_teardown); + +/** + * pci_ide_stream_enable() - enable a Selective IDE Stream + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered and setup IDE settings descriptor + * + * Activate the stream by writing to the Selective IDE Stream Control + * Register. + * + * Return: 0 if the stream successfully entered the "secure" state, and -EINVAL + * if @ide is invalid, and -ENXIO if the stream fails to enter the secure state. + * + * Note that the state may go "insecure" at any point after returning 0, but + * those events are equivalent to a "link down" event and handled via + * asynchronous error reporting. + * + * Caller is responsible to clear the enable bit in the -ENXIO case. + */ +int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + u32 val; + + if (!settings) + return -EINVAL; + + pos = sel_ide_offset(pdev, settings); + + set_ide_sel_ctl(pdev, ide, settings, pos, true); + settings->enable = 1; + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_STS, &val); + if (FIELD_GET(PCI_IDE_SEL_STS_STATE, val) != + PCI_IDE_SEL_STS_STATE_SECURE) + return -ENXIO; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_enable); + +/** + * pci_ide_stream_disable() - disable a Selective IDE Stream + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered and setup IDE settings descriptor + * + * Clear the Selective IDE Stream Control Register, but leave all other + * registers untouched. 
+ */ +void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + settings->enable = 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_disable); + +void pci_ide_init_host_bridge(struct pci_host_bridge *hb) +{ + hb->nr_ide_streams = 256; + ida_init(&hb->ide_stream_ida); +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6e4cc1c9aa58..d3f16be40102 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -615,8 +615,10 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); +void pci_ide_init_host_bridge(struct pci_host_bridge *hb); #else static inline void pci_ide_init(struct pci_dev *dev) { } +static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } #endif #ifdef CONFIG_PCI_TSM diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 3b54f1720be5..93fa7ba8dfa6 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -672,6 +672,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_dpc = 1; bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET; bridge->native_cxl_error = 1; + pci_ide_init_host_bridge(bridge); device_initialize(&bridge->dev); } diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h new file mode 100644 index 000000000000..e638f9429bf9 --- /dev/null +++ b/include/linux/pci-ide.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common helpers for drivers (e.g. low-level PCI/TSM drivers) implementing the + * IDE key management protocol (IDE_KM) as defined by: + * PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) + * + * Copyright(c) 2024-2025 Intel Corporation. All rights reserved. + */ + +#ifndef __PCI_IDE_H__ +#define __PCI_IDE_H__ + +enum pci_ide_partner_select { + PCI_IDE_EP, + PCI_IDE_RP, + PCI_IDE_PARTNER_MAX, + /* + * In addition to the resources in each partner port the + * platform / host-bridge additionally has a Stream ID pool that + * it shares across root ports. Let pci_ide_stream_alloc() use + * the alloc_stream_index() helper as endpoints and root ports. 
+ */ + PCI_IDE_HB = PCI_IDE_PARTNER_MAX, +}; + +/** + * struct pci_ide_partner - Per port pair Selective IDE Stream settings + * @rid_start: Partner Port Requester ID range start + * @rid_end: Partner Port Requester ID range end + * @stream_index: Selective IDE Stream Register Block selection + * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of + * address and RID association registers + * @setup: flag to track whether to run pci_ide_stream_teardown() for this + * partner slot + * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + */ +struct pci_ide_partner { + u16 rid_start; + u16 rid_end; + u8 stream_index; + unsigned int default_stream:1; + unsigned int setup:1; + unsigned int enable:1; +}; + +/** + * struct pci_ide - PCIe Selective IDE Stream descriptor + * @pdev: PCIe Endpoint in the pci_ide_partner pair + * @partner: per-partner settings + * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool + * @stream_id: unique Stream ID (within Partner Port pairing) + * @name: name of the established Selective IDE Stream in sysfs + * + * Negative @stream_id values indicate "uninitialized" on the + * expectation that with TSM established IDE the TSM owns the stream_id + * allocation. + */ +struct pci_ide { + struct pci_dev *pdev; + struct pci_ide_partner partner[PCI_IDE_PARTNER_MAX]; + u8 host_bridge_stream; + int stream_id; + const char *name; +}; + +struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, + struct pci_ide *ide); +struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); +void pci_ide_stream_free(struct pci_ide *ide); +int pci_ide_stream_register(struct pci_ide *ide); +void pci_ide_stream_unregister(struct pci_ide *ide); +void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide); +int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_release(struct pci_ide *ide); +DEFINE_FREE(pci_ide_stream_release, struct pci_ide *, if (_T) pci_ide_stream_release(_T)) +#endif /* __PCI_IDE_H__ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index ea94799c81b0..2c8dbae4916c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -545,6 +545,8 @@ struct pci_dev { u16 ide_cap; /* Link Integrity & Data Encryption */ u8 nr_ide_mem; /* Address association resources for streams */ u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + u16 nr_sel_ide; /* Selective Stream count (register block allocator) */ + struct ida ide_stream_ida; unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif @@ -614,6 +616,10 @@ struct pci_host_bridge { int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ +#ifdef CONFIG_PCI_IDE + u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ + struct ida ide_stream_ida; +#endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); -- cgit v1.2.3 From 9ddaf9c3ed007cd03c1335fb40920ad76f72a3d5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:00 -0700 Subject: PCI/IDE: Report available IDE streams The limited number of link-encryption (IDE) streams that a given set of host bridges supports is a 
platform specific detail. Provide pci_ide_init_nr_streams() as a generic facility for either platform TSM drivers, or PCI core native IDE, to report the number available streams. After invoking pci_ide_init_nr_streams() an "available_secure_streams" attribute appears in PCI host bridge sysfs to convey that count. Introduce a device-type, @pci_host_bridge_type, now that both a release method and sysfs attribute groups are being specified for all 'struct pci_host_bridge' instances. Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-9-dan.j.williams@intel.com Signed-off-by: Dan Williams --- .../ABI/testing/sysfs-devices-pci-host-bridge | 12 ++++ drivers/pci/ide.c | 67 ++++++++++++++++++++++ drivers/pci/pci.h | 1 + drivers/pci/probe.c | 14 ++++- include/linux/pci-ide.h | 1 + 5 files changed, 94 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge index 2c66e5bb2bf8..b91ec3450811 100644 --- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge +++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge @@ -31,3 +31,15 @@ Description: platform specific pool of stream resources shared by the Root Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for details about the DDDD:BB format. + +What: pciDDDD:BB/available_secure_streams +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a host bridge has Root Ports that support PCIe IDE + (link encryption and integrity protection) there may be a + limited number of Selective IDE Streams that can be used for + establishing new end-to-end secure links. This attribute + decrements upon secure link setup, and increments upon secure + link teardown. The in-use stream count is determined by counting + stream symlinks. See /sys/devices/pciDDDD:BB entry for details + about the DDDD:BB format. diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 7643840738fe..4ae3872589fc 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -514,3 +514,70 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb) hb->nr_ide_streams = 256; ida_init(&hb->ide_stream_ida); } + +static ssize_t available_secure_streams_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_host_bridge *hb = to_pci_host_bridge(dev); + int nr = READ_ONCE(hb->nr_ide_streams); + int avail = nr; + + if (!nr) + return -ENXIO; + + /* + * Yes, this is inefficient and racy, but it is only for occasional + * platform resource surveys. Worst case is bounded to 256 streams. 
+ */ + for (int i = 0; i < nr; i++) + if (ida_exists(&hb->ide_stream_ida, i)) + avail--; + return sysfs_emit(buf, "%d\n", avail); +} +static DEVICE_ATTR_RO(available_secure_streams); + +static struct attribute *pci_ide_attrs[] = { + &dev_attr_available_secure_streams.attr, + NULL +}; + +static umode_t pci_ide_attr_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_host_bridge *hb = to_pci_host_bridge(dev); + + if (a == &dev_attr_available_secure_streams.attr) + if (!hb->nr_ide_streams) + return 0; + + return a->mode; +} + +const struct attribute_group pci_ide_attr_group = { + .attrs = pci_ide_attrs, + .is_visible = pci_ide_attr_visible, +}; + +/** + * pci_ide_set_nr_streams() - sets size of the pool of IDE Stream resources + * @hb: host bridge boundary for the stream pool + * @nr: number of streams + * + * Platform PCI init and/or expert test module use only. Limit IDE + * Stream establishment by setting the number of stream resources + * available at the host bridge. Platform init code must set this before + * the first pci_ide_stream_alloc() call if the platform has less than the + * default of 256 streams per host-bridge. + * + * The "PCI_IDE" symbol namespace is required because this is typically + * a detail that is settled in early PCI init. I.e. this export is not + * for endpoint drivers. + */ +void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr) +{ + hb->nr_ide_streams = min(nr, 256); + WARN_ON_ONCE(!ida_is_empty(&hb->ide_stream_ida)); + sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group); +} +EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE"); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d3f16be40102..f6ffe5ee4717 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -616,6 +616,7 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); void pci_ide_init_host_bridge(struct pci_host_bridge *hb); +extern const struct attribute_group pci_ide_attr_group; #else static inline void pci_ide_init(struct pci_dev *dev) { } static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 93fa7ba8dfa6..cfacf5bcd073 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -653,6 +653,18 @@ static void pci_release_host_bridge_dev(struct device *dev) kfree(bridge); } +static const struct attribute_group *pci_host_bridge_groups[] = { +#ifdef CONFIG_PCI_IDE + &pci_ide_attr_group, +#endif + NULL +}; + +static const struct device_type pci_host_bridge_type = { + .groups = pci_host_bridge_groups, + .release = pci_release_host_bridge_dev, +}; + static void pci_init_host_bridge(struct pci_host_bridge *bridge) { INIT_LIST_HEAD(&bridge->windows); @@ -672,6 +684,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_dpc = 1; bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET; bridge->native_cxl_error = 1; + bridge->dev.type = &pci_host_bridge_type; pci_ide_init_host_bridge(bridge); device_initialize(&bridge->dev); @@ -686,7 +699,6 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) return NULL; pci_init_host_bridge(bridge); - bridge->dev.release = pci_release_host_bridge_dev; return bridge; } diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index e638f9429bf9..85645b0a8620 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -63,6 +63,7 @@ struct pci_ide { const char *name; }; +void 
pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); -- cgit v1.2.3 From a4438f06b1db15ce3d831ce82b8767665638aa2a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:01 -0700 Subject: PCI/TSM: Report active IDE streams Given that the platform TSM owns IDE Stream ID allocation, report the active streams via the TSM class device. Establish a symlink from the class device to the PCI endpoint device consuming the stream, named by the Stream ID. Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-10-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-class-tsm | 10 ++++++++++ drivers/pci/ide.c | 4 ++++ drivers/virt/coco/tsm-core.c | 28 ++++++++++++++++++++++++++++ include/linux/pci-ide.h | 2 ++ include/linux/tsm.h | 3 +++ 5 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm index 2949468deaf7..6fc1a5ac6da1 100644 --- a/Documentation/ABI/testing/sysfs-class-tsm +++ b/Documentation/ABI/testing/sysfs-class-tsm @@ -7,3 +7,13 @@ Description: signals when the PCI layer is able to support establishment of link encryption and other device-security features coordinated through a platform tsm. + +What: /sys/class/tsm/tsmN/streamH.R.E +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a host bridge has established a secure connection via + the platform TSM, symlink appears. The primary function of this + is have a system global review of TSM resource consumption + across host bridges. The link points to the endpoint PCI device + and matches the same link published by the host bridge. See + Documentation/ABI/testing/sysfs-devices-pci-host-bridge. 
diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 4ae3872589fc..da5b1acccbb4 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "pci.h" @@ -261,6 +262,9 @@ void pci_ide_stream_release(struct pci_ide *ide) if (ide->partner[PCI_IDE_EP].enable) pci_ide_stream_disable(pdev, ide); + if (ide->tsm_dev) + tsm_ide_stream_unregister(ide); + if (ide->partner[PCI_IDE_RP].setup) pci_ide_stream_teardown(rp, ide); diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c index 0e705f3067a1..f027876a2f19 100644 --- a/drivers/virt/coco/tsm-core.c +++ b/drivers/virt/coco/tsm-core.c @@ -4,11 +4,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include #include #include +#include static struct class *tsm_class; static DECLARE_RWSEM(tsm_rwsem); @@ -106,6 +108,32 @@ void tsm_unregister(struct tsm_dev *tsm_dev) } EXPORT_SYMBOL_GPL(tsm_unregister); +/* must be invoked between tsm_register / tsm_unregister */ +int tsm_ide_stream_register(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_tsm *tsm = pdev->tsm; + struct tsm_dev *tsm_dev = tsm->tsm_dev; + int rc; + + rc = sysfs_create_link(&tsm_dev->dev.kobj, &pdev->dev.kobj, ide->name); + if (rc) + return rc; + + ide->tsm_dev = tsm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_ide_stream_register); + +void tsm_ide_stream_unregister(struct pci_ide *ide) +{ + struct tsm_dev *tsm_dev = ide->tsm_dev; + + ide->tsm_dev = NULL; + sysfs_remove_link(&tsm_dev->dev.kobj, ide->name); +} +EXPORT_SYMBOL_GPL(tsm_ide_stream_unregister); + static void tsm_release(struct device *dev) { struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev); diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 85645b0a8620..d0f10f3c89fc 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -50,6 +50,7 @@ struct pci_ide_partner { * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool * @stream_id: unique Stream ID (within Partner Port pairing) * @name: name of the established Selective IDE Stream in sysfs + * @tsm_dev: For TSM established IDE, the TSM device context * * Negative @stream_id values indicate "uninitialized" on the * expectation that with TSM established IDE the TSM owns the stream_id @@ -61,6 +62,7 @@ struct pci_ide { u8 host_bridge_stream; int stream_id; const char *name; + struct tsm_dev *tsm_dev; }; void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 22e05b2aac69..a3b7ab668eff 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -123,4 +123,7 @@ int tsm_report_unregister(const struct tsm_report_ops *ops); struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); struct tsm_dev *find_tsm_dev(int id); +struct pci_ide; +int tsm_ide_stream_register(struct pci_ide *ide); +void tsm_ide_stream_unregister(struct pci_ide *ide); #endif /* __TSM_H */ -- cgit v1.2.3 From e497310b4ffb559e1149ee89470d5c518d234ddf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:43:55 +0100 Subject: uaccess: Provide scoped user access regions User space access regions are tedious and require similar code patterns all over the place: if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; This got worse with the recent 
addition of masked user access, which optimizes the speculation prevention: if (can_do_masked_user_access()) from = masked_user_read_access_begin((from)); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; There have been issues with using the wrong user_*_access_end() variant in the error path and other typical Copy&Pasta problems, e.g. using the wrong fault label in the user accessor which ends up using the wrong accesss end variant. These patterns beg for scopes with automatic cleanup. The resulting outcome is: scoped_user_read_access(from, Efault) unsafe_get_user(val, from, Efault); return 0; Efault: return -EFAULT; The scope guarantees the proper cleanup for the access mode is invoked both in the success and the failure (fault) path. The scoped_user_$MODE_access() macros are implemented as self terminating nested for() loops. Thanks to Andrew Cooper for pointing me at them. The scope can therefore be left with 'break', 'goto' and 'return'. Even 'continue' "works" due to the self termination mechanism. Both GCC and clang optimize all the convoluted macro maze out and the above results with clang in: b80: f3 0f 1e fa endbr64 b84: 48 b8 ef cd ab 89 67 45 23 01 movabs $0x123456789abcdef,%rax b8e: 48 39 c7 cmp %rax,%rdi b91: 48 0f 47 f8 cmova %rax,%rdi b95: 90 nop b96: 90 nop b97: 90 nop b98: 31 c9 xor %ecx,%ecx b9a: 8b 07 mov (%rdi),%eax b9c: 89 06 mov %eax,(%rsi) b9e: 85 c9 test %ecx,%ecx ba0: 0f 94 c0 sete %al ba3: 90 nop ba4: 90 nop ba5: 90 nop ba6: c3 ret Which looks as compact as it gets. The NOPs are placeholder for STAC/CLAC. GCC emits the fault path seperately: bf0: f3 0f 1e fa endbr64 bf4: 48 b8 ef cd ab 89 67 45 23 01 movabs $0x123456789abcdef,%rax bfe: 48 39 c7 cmp %rax,%rdi c01: 48 0f 47 f8 cmova %rax,%rdi c05: 90 nop c06: 90 nop c07: 90 nop c08: 31 d2 xor %edx,%edx c0a: 8b 07 mov (%rdi),%eax c0c: 89 06 mov %eax,(%rsi) c0e: 85 d2 test %edx,%edx c10: 75 09 jne c1b c12: 90 nop c13: 90 nop c14: 90 nop c15: b8 01 00 00 00 mov $0x1,%eax c1a: c3 ret c1b: 90 nop c1c: 90 nop c1d: 90 nop c1e: 31 c0 xor %eax,%eax c20: c3 ret The fault labels for the scoped*() macros and the fault labels for the actual user space accessors can be shared and must be placed outside of the scope. If masked user access is enabled on an architecture, then the pointer handed in to scoped_user_$MODE_access() can be modified to point to a guaranteed faulting user address. This modification is only scope local as the pointer is aliased inside the scope. When the scope is left the alias is not longer in effect. IOW the original pointer value is preserved so it can be used e.g. for fixup or diagnostic purposes in the fault path. 
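As a write-side complement to the read example above, a minimal sketch (assuming only the macros introduced by this patch) looks like:

	static inline int put_val(u32 __user *to, u32 val)
	{
		scoped_user_write_access(to, Efault)
			unsafe_put_user(val, to, Efault);
		return 0;
	Efault:
		return -EFAULT;
	}

As with the read case, the fault label lives outside the scope and the matching user_write_access_end() is emitted automatically on every exit path.
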
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027083745.546420421@linutronix.de --- include/linux/uaccess.h | 192 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 8aa82b1d6013..5f142c05b0dc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include #include #include #include @@ -35,9 +36,17 @@ #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 +# ifndef masked_user_write_access_begin +# define masked_user_write_access_begin masked_user_access_begin +# endif +# ifndef masked_user_read_access_begin +# define masked_user_read_access_begin masked_user_access_begin +#endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define masked_user_read_access_begin(src) NULL + #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif @@ -633,6 +642,189 @@ static inline void user_access_restore(unsigned long flags) { } #define user_read_access_end user_access_end #endif +/* Define RW variant so the below _mode macro expansion works */ +#define masked_user_rw_access_begin(u) masked_user_access_begin(u) +#define user_rw_access_begin(u, s) user_access_begin(u, s) +#define user_rw_access_end() user_access_end() + +/* Scoped user access */ +#define USER_ACCESS_GUARD(_mode) \ +static __always_inline void __user * \ +class_user_##_mode##_begin(void __user *ptr) \ +{ \ + return ptr; \ +} \ + \ +static __always_inline void \ +class_user_##_mode##_end(void __user *ptr) \ +{ \ + user_##_mode##_access_end(); \ +} \ + \ +DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ + class_user_##_mode##_end(_T), \ + class_user_##_mode##_begin(ptr), void __user *ptr) \ + \ +static __always_inline class_user_##_mode##_access_t \ +class_user_##_mode##_access_ptr(void __user *scope) \ +{ \ + return scope; \ +} + +USER_ACCESS_GUARD(read) +USER_ACCESS_GUARD(write) +USER_ACCESS_GUARD(rw) +#undef USER_ACCESS_GUARD + +/** + * __scoped_user_access_begin - Start a scoped user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected + * + * Internal helper for __scoped_user_access(). Don't use directly. + */ +#define __scoped_user_access_begin(mode, uptr, size, elbl) \ +({ \ + typeof(uptr) __retptr; \ + \ + if (can_do_masked_user_access()) { \ + __retptr = masked_user_##mode##_access_begin(uptr); \ + } else { \ + __retptr = uptr; \ + if (!user_##mode##_access_begin(uptr, size)) \ + goto elbl; \ + } \ + __retptr; \ +}) + +/** + * __scoped_user_access - Open a scope for user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected. It + * must be placed outside the scope + * + * If the user access function inside the scope requires a fault label, it + * can use @elbl or a different label outside the scope, which requires + * that user access which is implemented with ASM GOTO has been properly + * wrapped. See unsafe_get_user() for reference. 
+ * + * scoped_user_rw_access(ptr, efault) { + * unsafe_get_user(rval, &ptr->rval, efault); + * unsafe_put_user(wval, &ptr->wval, efault); + * } + * return 0; + * efault: + * return -EFAULT; + * + * The scope is internally implemented as a autoterminating nested for() + * loop, which can be left with 'return', 'break' and 'goto' at any + * point. + * + * When the scope is left user_##@_mode##_access_end() is automatically + * invoked. + * + * When the architecture supports masked user access and the access region + * which is determined by @uptr and @size is not a valid user space + * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user + * space address and does not terminate early. This optimizes for the good + * case and lets the performance uncritical bad case go through the fault. + * + * The eventual modification of the pointer is limited to the scope. + * Outside of the scope the original pointer value is unmodified, so that + * the original pointer value is available for diagnostic purposes in an + * out of scope fault path. + * + * Nesting scoped user access into a user access scope is invalid and fails + * the build. Nesting into other guards, e.g. pagefault is safe. + * + * The masked variant does not check the size of the access and relies on a + * mapping hole (e.g. guard page) to catch an out of range pointer, the + * first access to user memory inside the scope has to be within + * @uptr ... @uptr + PAGE_SIZE - 1 + * + * Don't use directly. Use scoped_masked_user_$MODE_access() instead. + */ +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const typeof(uptr) uptr = _tmpptr; !done; done = true) + +/** + * scoped_user_read_access_size - Start a scoped user read access with given size + * @usrc: Pointer to the user space address to read from + * @size: Size of the access starting from @usrc + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access_size(usrc, size, elbl) \ + __scoped_user_access(read, usrc, size, elbl) + +/** + * scoped_user_read_access - Start a scoped user read access + * @usrc: Pointer to the user space address to read from + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @usrc is determined via sizeof(*@usrc)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access(usrc, elbl) \ + scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) + +/** + * scoped_user_write_access_size - Start a scoped user write access with given size + * @udst: Pointer to the user space address to write to + * @size: Size of the access starting from @udst + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. 
+ */ +#define scoped_user_write_access_size(udst, size, elbl) \ + __scoped_user_access(write, udst, size, elbl) + +/** + * scoped_user_write_access - Start a scoped user write access + * @udst: Pointer to the user space address to write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @udst is determined via sizeof(*@udst)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access(udst, elbl) \ + scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) + +/** + * scoped_user_rw_access_size - Start a scoped user read/write access with given size + * @uptr Pointer to the user space address to read from and write to + * @size: Size of the access starting from @uptr + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access_size(uptr, size, elbl) \ + __scoped_user_access(rw, uptr, size, elbl) + +/** + * scoped_user_rw_access - Start a scoped user read/write access + * @uptr Pointer to the user space address to read from and write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @uptr is determined via sizeof(*@uptr)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access(uptr, elbl) \ + scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, -- cgit v1.2.3 From b2cfc0cd68b830dde80fce2406580e258a1e976d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:43:56 +0100 Subject: uaccess: Provide put/get_user_inline() Provide convenience wrappers around scoped user access similar to put/get_user(), which reduce the usage sites to: if (!get_user_inline(val, ptr)) return -EFAULT; Should only be used if there is a demonstrable performance benefit. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027083745.609031602@linutronix.de --- include/linux/uaccess.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5f142c05b0dc..be395f5f7ee3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -825,6 +825,56 @@ for (bool done = false; !done; done = true) \ #define scoped_user_rw_access(uptr, elbl) \ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) +/** + * get_user_inline - Read user data inlined + * @val: The variable to store the value read from user memory + * @usrc: Pointer to the user space memory to read from + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of get_user(). Only use when there is a demonstrable + * performance reason. 
+ */ +#define get_user_inline(val, usrc) \ +({ \ + __label__ efault; \ + typeof(usrc) _tmpsrc = usrc; \ + int _ret = 0; \ + \ + scoped_user_read_access(_tmpsrc, efault) \ + unsafe_get_user(val, _tmpsrc, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + +/** + * put_user_inline - Write to user memory inlined + * @val: The value to write + * @udst: Pointer to the user space memory to write to + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of put_user(). Only use when there is a demonstrable + * performance reason. + */ +#define put_user_inline(val, udst) \ +({ \ + __label__ efault; \ + typeof(udst) _tmpdst = udst; \ + int _ret = 0; \ + \ + scoped_user_write_access(_tmpdst, efault) \ + unsafe_put_user(val, _tmpdst, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, -- cgit v1.2.3 From 3ca59da7aa5c7f569b04a511dc8670861d58b509 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:16 +0100 Subject: rseq: Avoid pointless evaluation in __rseq_notify_resume() The RSEQ critical section mechanism only clears the event mask when a critical section is registered, otherwise it is stale and collects bits. That means once a critical section is installed the first invocation of that code when TIF_NOTIFY_RESUME is set will abort the critical section, even when the TIF bit was not raised by the rseq preempt/migrate/signal helpers. This also has a performance implication because TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by quite some infrastructure. That means every invocation of __rseq_notify_resume() goes unconditionally through the heavy lifting of user space access and consistency checks even if there is no reason to do so. Keeping the stale event mask around when exiting to user space also prevents it from being utilized by the upcoming time slice extension mechanism. Avoid this by reading and clearing the event mask before doing the user space critical section access with interrupts or preemption disabled, which ensures that the read and clear operation is CPU local atomic versus scheduling and the membarrier IPI. This is correct as after re-enabling interrupts/preemption any relevant event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the user space exit code take another round of TIF bit clearing. If the event mask was non-zero, invoke the slow path. On debug kernels the slow path is invoked unconditionally and the result of the event mask evaluation is handed in. Add a exit path check after the TIF bit loop, which validates on debug kernels that the event mask is zero before exiting to user space. While at it reword the convoluted comment why the pt_regs pointer can be NULL under certain circumstances. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de --- include/linux/irq-entry-common.h | 7 +++-- include/linux/rseq.h | 10 +++++- kernel/rseq.c | 66 ++++++++++++++++++++++++++-------------- 3 files changed, 58 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..e5941df13901 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -2,11 +2,12 @@ #ifndef __LINUX_IRQENTRYCOMMON_H #define __LINUX_IRQENTRYCOMMON_H +#include +#include +#include #include #include -#include #include -#include #include #include @@ -226,6 +227,8 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) arch_exit_to_user_mode_prepare(regs, ti_work); + rseq_exit_to_user_mode(); + /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 69553e7c14c1..7622b733a508 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -66,6 +66,14 @@ static inline void rseq_migrate(struct task_struct *t) rseq_set_notify_resume(t); } +static __always_inline void rseq_exit_to_user_mode(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { + if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) + current->rseq_event_mask = 0; + } +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. @@ -118,7 +126,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_execve(struct task_struct *t) { } - +static inline void rseq_exit_to_user_mode(void) { } #endif #ifdef CONFIG_DEBUG_RSEQ diff --git a/kernel/rseq.c b/kernel/rseq.c index 2452b7366b00..246319d7cb0c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *str, u32 flags) return true; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static int rseq_check_flags(struct task_struct *t, u32 cs_flags) { - u32 flags, event_mask; + u32 flags; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) @@ -339,17 +339,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) if (rseq_warn_flags("rseq", flags)) return -EINVAL; - - /* - * Load and clear event mask atomically with respect to - * scheduler preemption and membarrier IPIs. 
- */ - scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - } - - return !!event_mask; + return 0; } static int clear_rseq_cs(struct rseq __user *rseq) @@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } -static int rseq_ip_fixup(struct pt_regs *regs) +static int rseq_ip_fixup(struct pt_regs *regs, bool abort) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; @@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) + ret = rseq_check_flags(t, rseq_cs.flags); + if (ret < 0) return ret; + if (!abort) + return 0; ret = clear_rseq_cs(t->rseq); if (ret) return ret; @@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) return; /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * If invoked from hypervisors or IO-URING, then @regs is a NULL + * pointer, so fixup cannot be done. If the syscall which led to + * this invocation was invoked inside a critical section, then it + * will either end up in this code again or a possible violation of + * a syscall inside a critical region can only be detected by the + * debug code in rseq_syscall() in a debug enabled kernel. */ if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + /* + * Read and clear the event mask first. If the task was not + * preempted or migrated or a signal is on the way, there + * is no point in doing any of the heavy lifting here on + * production kernels. In that case TIF_NOTIFY_RESUME was + * raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. + */ + u32 event_mask; + + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { + ret = rseq_ip_fixup(regs, !!event_mask); + if (unlikely(ret < 0)) + goto error; + } } if (unlikely(rseq_update_cpu_node_id(t))) goto error; -- cgit v1.2.3 From fdc0f39d289ebcf46ef44f43460207ef24c94ed7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:18 +0100 Subject: rseq: Condense the inline stubs Scrolling over tons of pointless { } lines to find the actual code is annoying at best. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.085971048@linutronix.de --- include/linux/rseq.h | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7622b733a508..21f875af0e96 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -101,44 +101,21 @@ static inline void rseq_execve(struct task_struct *t) t->rseq_event_mask = 0; } -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, u64 clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} +#else /* CONFIG_RSEQ */ +static inline void rseq_set_notify_resume(struct task_struct *t) { } +static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_preempt(struct task_struct *t) { } +static inline void rseq_migrate(struct task_struct *t) { } +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } +static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } -#endif +#endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ - void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif +#else /* CONFIG_DEBUG_RSEQ */ +static inline void rseq_syscall(struct pt_regs *regs) { } +#endif /* !CONFIG_DEBUG_RSEQ */ #endif /* _LINUX_RSEQ_H */ -- cgit v1.2.3 From 41b43a6ba3848be8ceec77b8b2a56ddeca6167ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:22 +0100 Subject: rseq: Remove the ksig argument from rseq_handle_notify_resume() There is no point for this being visible in the resume_to_user_mode() handling. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.211520245@linutronix.de --- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index e0135e0adae0..dd3bf7da90a8 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(NULL, regs); + rseq_handle_notify_resume(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 21f875af0e96..d72ddf7ce903 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,19 +37,20 @@ static inline void rseq_set_notify_resume(struct task_struct *t) void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - rseq_handle_notify_resume(ksig, regs); + if (current->rseq) { + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + __rseq_handle_notify_resume(ksig, regs); + } } /* rseq_preempt() requires preemption to be disabled. */ @@ -103,7 +104,7 @@ static inline void rseq_execve(struct task_struct *t) #else /* CONFIG_RSEQ */ static inline void rseq_set_notify_resume(struct task_struct *t) { } -static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } -- cgit v1.2.3 From d923739e2e356424cc566143a3323c62cd6ed067 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:26 +0100 Subject: rseq: Simplify the event notification Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags") the bits in task::rseq_event_mask are meaningless and just extra work in terms of setting them individually. Aside of that the only relevant point where an event has to be raised is context switch. Neither the CPU nor MM CID can change without going through a context switch. Collapse them all into a single boolean which simplifies the code a lot and remove the pointless invocations which have been sprinkled all over the place for no value. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de --- fs/exec.c | 2 +- include/linux/rseq.h | 66 ++++++++++------------------------------------- include/linux/sched.h | 10 +++---- include/uapi/linux/rseq.h | 21 +++++---------- kernel/rseq.c | 28 ++++++++++++-------- kernel/sched/core.c | 5 +--- kernel/sched/membarrier.c | 8 +++--- 7 files changed, 48 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 4298e7e08d5d..e45b29890269 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1775,7 +1775,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_set_notify_resume(current); + rseq_sched_switch_event(current); current->in_execve = 0; return retval; diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d72ddf7ce903..241067bf20db 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -3,38 +3,8 @@ #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ - -#include #include -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_resume(struct pt_regs *regs) __rseq_handle_notify_resume(NULL, regs); } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + current->rseq_event_pending = true; __rseq_handle_notify_resume(ksig, regs); } } -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +static inline void rseq_sched_switch_event(struct task_struct *t) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. 
*/ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + if (t->rseq) { + t->rseq_event_pending = true; + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } } static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) - current->rseq_event_mask = 0; + if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) + current->rseq_event_pending = false; } } @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; + t->rseq_event_pending = current->rseq_event_pending; } } @@ -99,15 +61,13 @@ static inline void rseq_execve(struct task_struct *t) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ -static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } -static inline void rseq_preempt(struct task_struct *t) { } -static inline void rseq_migrate(struct task_struct *t) { } +static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c..6627c527c2c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,14 +1407,14 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; /* - * RmW on rseq_event_mask must be performed atomically + * RmW on rseq_event_pending must be performed atomically * with respect to preemption. */ - unsigned long rseq_event_mask; + bool rseq_event_pending; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index c233aae5eac9..1b76d508400c 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -114,20 +114,13 @@ struct rseq { /* * Restartable sequences flags field. * - * This field should only be updated by the thread which - * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. - * - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart on preemption - * for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart on signal - * delivery for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart on migration for - * this thread. + * This field was initially intended to allow event masking for + * single-stepping through rseq critical sections with debuggers. 
+ * The kernel does not support this anymore and the relevant bits + * are checked for being always false: + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE */ __u32 flags; diff --git a/kernel/rseq.c b/kernel/rseq.c index 80af48a972f0..59adc1a7183b 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,6 +78,12 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 @@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) */ if (regs) { /* - * Read and clear the event mask first. If the task was not - * preempted or migrated or a signal is on the way, there - * is no point in doing any of the heavy lifting here on - * production kernels. In that case TIF_NOTIFY_RESUME was - * raised by some other functionality. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. * * This is correct because the read/clear operation is * guarded against scheduler preemption, which makes it CPU @@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) * with the result handed in to allow the detection of * inconsistencies. */ - u32 event_mask; + bool event; scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; + event = t->rseq_event_pending; + t->rseq_event_pending = false; } - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { - ret = rseq_ip_fixup(regs, !!event_mask); + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { + ret = rseq_ip_fixup(regs, event); if (unlikely(ret < 0)) goto error; } @@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ - rseq_set_notify_resume(current); + rseq_sched_switch_event(current); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..b75e8e1eca4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3329,7 +3329,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -4763,7 +4762,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4827,7 +4825,6 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. 
*/ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -5121,7 +5118,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); + rseq_sched_switch_event(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62fba83b7bb1..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. */ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); -- cgit v1.2.3 From 83409986f49f17b14a675f9c598ad50d4c60191b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:28 +0100 Subject: rseq, virt: Retrigger RSEQ after vcpu_run() Hypervisors invoke resume_user_mode_work() before entering the guest, which clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user space context available to them, so the rseq notify handler skips inspecting the critical section, but updates the CPU/MM CID values unconditionally so that the eventual pending rseq event is not lost on the way to user space. This is a pointless exercise as the task might be rescheduled before actually returning to user space and it creates unnecessary work in the vcpu_run() loops. It's way more efficient to ignore that invocation based on @regs == NULL and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the vcpu_run() loop before returning from the ioctl(). This ensures that a pending RSEQ update is not lost and the IDs are updated before returning to user space. Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into a NOOP. 
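A minimal sketch of a vcpu run loop after this change. Except for rseq_virt_userspace_exit(), the function and type names are illustrative placeholders for hypervisor specific code:

static long vcpu_run_loop_sketch(struct vcpu_sketch *vcpu)
{
	long ret;

	do {
		/* Clears TIF_NOTIFY_RESUME; rseq sees @regs == NULL and ignores it */
		handle_pre_guest_work(NULL);
		ret = enter_guest(vcpu);
	} while (keep_running(vcpu, ret));

	/* Re-raise TIF_NOTIFY_RESUME if an rseq event is still pending */
	rseq_virt_userspace_exit();
	return ret;
}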
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Acked-by: Sean Christopherson Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de --- drivers/hv/mshv_root_main.c | 3 ++ include/linux/rseq.h | 17 ++++++++++ kernel/rseq.c | 78 ++++++++++++++++++++++++--------------------- virt/kvm/kvm_main.c | 7 ++++ 4 files changed, 68 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..a21a0eb0f5be 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "mshv_eventfd.h" #include "mshv.h" @@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) } } while (!vp->run.flags.intercept_suspend); + rseq_virt_userspace_exit(); + return ret; } diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 241067bf20db..c6267f70c746 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void) } } +/* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in + * that case just to do it eventually again before returning to user space, + * the entry resume_user_mode_work() invocation is ignored as the register + * argument is NULL. + * + * After returning from guest mode, they have to invoke this function to + * re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) +{ + if (current->rseq_event_pending) + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. @@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/kernel/rseq.c b/kernel/rseq.c index 59adc1a7183b..01e711383e05 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; + bool event; + + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) + return; if (unlikely(t->flags & PF_EXITING)) return; /* - * If invoked from hypervisors or IO-URING, then @regs is a NULL - * pointer, so fixup cannot be done. 
If the syscall which led to - * this invocation was invoked inside a critical section, then it - * will either end up in this code again or a possible violation of - * a syscall inside a critical region can only be detected by the - * debug code in rseq_syscall() in a debug enabled kernel. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (regs) { - /* - * Read and clear the event pending bit first. If the task - * was not preempted or migrated or a signal is on the way, - * there is no point in doing any of the heavy lifting here - * on production kernels. In that case TIF_NOTIFY_RESUME - * was raised by some other functionality. - * - * This is correct because the read/clear operation is - * guarded against scheduler preemption, which makes it CPU - * local atomic. If the task is preempted right after - * re-enabling preemption then TIF_NOTIFY_RESUME is set - * again and this function is invoked another time _before_ - * the task is able to return to user mode. - * - * On a debug kernel, invoke the fixup code unconditionally - * with the result handed in to allow the detection of - * inconsistencies. - */ - bool event; - - scoped_guard(RSEQ_EVENT_GUARD) { - event = t->rseq_event_pending; - t->rseq_event_pending = false; - } - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) - goto error; - } + scoped_guard(RSEQ_EVENT_GUARD) { + event = t->rseq_event_pending; + t->rseq_event_pending = false; } + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { + ret = rseq_ip_fixup(regs, event); + if (unlikely(ret < 0)) + goto error; + } + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4255fcf9c6e5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; + /* + * FIXME: Remove this hack once all KVM architectures + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. + */ + rseq_virt_userspace_exit(); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } -- cgit v1.2.3 From faba9d250eaec7afa248bba71531a08ccc497aab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:33 +0100 Subject: rseq: Introduce struct rseq_data In preparation for a major rewrite of this code, provide a data structure for rseq management. Put all the rseq related data into it (except for the debug part), which allows to simplify fork/execve by using memset() and memcpy() instead of adding new fields to initialize over and over. 
Create a storage struct for event management as well and put the sched_switch event and a indicator for RSEQ on a task into it as a start. That uses a union, which allows to mask and clear the whole lot efficiently. The indicators are explicitly not a bit field. Bit fields generate abysmal code. The boolean members are defined as u8 as that actually guarantees that it fits. There seem to be strange architecture ABIs which need more than 8 bits for a boolean. The has_rseq member is redundant vs. task::rseq, but it turns out that boolean operations and quick checks on the union generate better code than fiddling with separate entities and data types. This struct will be extended over time to carry more information. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.527086690@linutronix.de --- include/linux/rseq.h | 48 ++++++++++++++++------------------- include/linux/rseq_types.h | 51 +++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 14 +++-------- kernel/ptrace.c | 6 ++--- kernel/rseq.c | 63 +++++++++++++++++++++++----------------------- 5 files changed, 110 insertions(+), 72 deletions(-) create mode 100644 include/linux/rseq_types.h (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index c6267f70c746..ab91b1e6bb4a 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq) + if (current->rseq.event.has_rseq) __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq) { - current->rseq_event_pending = true; + if (current->rseq.event.has_rseq) { + current->rseq.event.sched_switch = true; __rseq_handle_notify_resume(ksig, regs); } } static inline void rseq_sched_switch_event(struct task_struct *t) { - if (t->rseq) { - t->rseq_event_pending = true; + if (t->rseq.event.has_rseq) { + t->rseq.event.sched_switch = true; set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } @@ -32,8 +32,9 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) - current->rseq_event_pending = false; + if (WARN_ON_ONCE(current->rseq.event.has_rseq && + current->rseq.event.events)) + current->rseq.event.events = 0; } } @@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to_user_mode(void) */ static inline void rseq_virt_userspace_exit(void) { - if (current->rseq_event_pending) + if (current->rseq.event.sched_switch) set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } +static inline void rseq_reset(struct task_struct *t) +{ + memset(&t->rseq, 0, sizeof(t->rseq)); +} + +static inline void rseq_execve(struct task_struct *t) +{ + rseq_reset(t); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
*/ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; - } else { + if (clone_flags & CLONE_VM) + rseq_reset(t); + else t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_pending = current->rseq_event_pending; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h new file mode 100644 index 000000000000..f7a60c8eddc9 --- /dev/null +++ b/include/linux/rseq_types.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_TYPES_H +#define _LINUX_RSEQ_TYPES_H + +#include + +#ifdef CONFIG_RSEQ +struct rseq; + +/** + * struct rseq_event - Storage for rseq related event management + * @all: Compound to initialize and clear the data efficiently + * @events: Compound to access events with a single load/store + * @sched_switch: True if the task was scheduled out + * @has_rseq: True if the task has a rseq pointer installed + */ +struct rseq_event { + union { + u32 all; + struct { + union { + u16 events; + struct { + u8 sched_switch; + }; + }; + + u8 has_rseq; + }; + }; +}; + +/** + * struct rseq_data - Storage for all rseq related data + * @usrptr: Pointer to the registered user space RSEQ memory + * @len: Length of the RSEQ region + * @sig: Signature of critial section abort IPs + * @event: Storage for event management + */ +struct rseq_data { + struct rseq __user *usrptr; + u32 len; + u32 sig; + struct rseq_event event; +}; + +#else /* CONFIG_RSEQ */ +struct rseq_data { }; +#endif /* !CONFIG_RSEQ */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 6627c527c2c7..15627769409d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1406,16 +1407,8 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_pending must be performed atomically - * with respect to preemption. - */ - bool rseq_event_pending; -# ifdef CONFIG_DEBUG_RSEQ + struct rseq_data rseq; +#ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a @@ -1423,7 +1416,6 @@ struct task_struct { * directly. Reserve a size large enough for the known fields. 
*/ char rseq_fields[sizeof(struct rseq)]; -# endif #endif #ifdef CONFIG_SCHED_MM_CID diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 81dddafa2f2e..aae62661e6bb 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -103,13 +103,13 @@ static int rseq_validate_ro_fields(struct task_struct *t) DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; /* * Validate fields which are required to be read-only by * user-space. */ - if (!user_read_access_begin(rseq, t->rseq_len)) + if (!user_read_access_begin(rseq, t->rseq.len)) goto efault; unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); @@ -147,10 +147,10 @@ efault: * Update an rseq field and its in-kernel copy in lock-step to keep a coherent * state. */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ } while (0) #else @@ -160,12 +160,12 @@ static int rseq_validate_ro_fields(struct task_struct *t) } #define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) + unsafe_put_user(value, &t->rseq.usrptr->field, error_label) #endif static int rseq_update_cpu_node_id(struct task_struct *t) { - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); @@ -176,7 +176,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) if (rseq_validate_ro_fields(t)) goto efault; WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); @@ -201,7 +201,7 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { - struct rseq __user *rseq = t->rseq; + struct rseq __user *rseq = t->rseq.usrptr; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -211,7 +211,7 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) if (rseq_validate_ro_fields(t)) goto efault; - if (!user_write_access_begin(rseq, t->rseq_len)) + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; /* @@ -272,7 +272,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); if (ret) return ret; @@ -305,10 +305,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) if (ret) return ret; - if (current->rseq_sig != sig) { + 
if (current->rseq.sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); + sig, current->rseq.sig, current->pid, usig); return -EINVAL; } return 0; @@ -338,7 +338,7 @@ static int rseq_check_flags(struct task_struct *t, u32 cs_flags) return -EINVAL; /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq.usrptr->flags); if (ret) return ret; @@ -392,13 +392,13 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); + return clear_rseq_cs(t->rseq.usrptr); ret = rseq_check_flags(t, rseq_cs.flags); if (ret < 0) return ret; if (!abort) return 0; - ret = clear_rseq_cs(t->rseq); + ret = clear_rseq_cs(t->rseq.usrptr); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -460,8 +460,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) * inconsistencies. */ scoped_guard(RSEQ_EVENT_GUARD) { - event = t->rseq_event_pending; - t->rseq_event_pending = false; + event = t->rseq.event.sched_switch; + t->rseq.event.sched_switch = false; } if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) @@ -492,7 +492,7 @@ void rseq_syscall(struct pt_regs *regs) struct task_struct *t = current; struct rseq_cs rseq_cs; - if (!t->rseq) + if (!t->rseq.usrptr) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); @@ -511,33 +511,31 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -586,15 +584,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. 
*/ + current->rseq.event.has_rseq = true; rseq_sched_switch_event(current); return 0; -- cgit v1.2.3 From 5204be16790f305febbf331d0ec2cead7978b3c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:36 +0100 Subject: entry: Clean up header Clean up the include ordering, kernel-doc and other trivialities before making further changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.590338411@linutronix.de --- include/linux/entry-common.h | 8 ++++---- include/linux/irq-entry-common.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 7177436f0f9e..c585221ff16b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,11 +3,11 @@ #define __LINUX_ENTRYCOMMON_H #include +#include #include +#include #include #include -#include -#include #include #include @@ -37,6 +37,7 @@ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) + #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ @@ -61,8 +62,7 @@ */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index e5941df13901..9b1f386ffeb1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -68,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; } /** * enter_from_user_mode - Establish state when coming from user mode + * @regs: Pointer to currents pt_regs * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -357,6 +358,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. */ void raw_irqentry_exit_cond_resched(void); + #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched -- cgit v1.2.3 From 54a5ab56242f96555999aaa41228f77b4a76e386 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:38 +0100 Subject: entry: Remove syscall_enter_from_user_mode_prepare() Open code the only user in the x86 syscall code and reduce the zoo of functions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.652839989@linutronix.de --- arch/x86/entry/syscall_32.c | 3 ++- include/linux/entry-common.h | 26 +++++--------------------- kernel/entry/syscall-common.c | 8 -------- 3 files changed, 7 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 2b15ea17bb7c..a67a644d0cfe 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) * fetch EBP before invoking any of the syscall entry work * functions. 
*/ - syscall_enter_from_user_mode_prepare(regs); + enter_from_user_mode(regs); instrumentation_begin(); + local_irq_enable(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { /* diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c585221ff16b..75b194c34e18 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,23 +45,6 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -/** - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts - * @regs: Pointer to currents pt_regs - * - * Invoked from architecture specific syscall entry code with interrupts - * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct, interrupts are enabled and the - * subsequent functions can be instrumented. - * - * This handles lockdep, RCU (context tracking) and tracing state, i.e. - * the functionality provided by enter_from_user_mode(). - * - * This is invoked when there is extra architecture specific functionality - * to be done between establishing state and handling user mode entry work. - */ -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); - long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra - * architecture specific work. + * enabled after invoking enter_from_user_mode(), enabling interrupts and + * extra architecture specific work. * * Returns: The original or a modified syscall number * @@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This is combination of syscall_enter_from_user_mode_prepare() and - * syscall_enter_from_user_mode_work(). + * This is the combination of enter_from_user_mode() and + * syscall_enter_from_user_mode_work() to be used when there is no + * architecture specific work to be done between the two. * * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 66e6ba7fa80c..940a597ded40 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall -- cgit v1.2.3 From 7702a9c2856794b6bf961b408eba3bacb753bd5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:40 +0100 Subject: entry: Inline irqentry_enter/exit_from/to_user_mode() There is no point to have this as a function which just inlines enter_from_user_mode(). The function call overhead is larger than the function itself. 
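A sketch of how an architecture exception path known to come from user mode uses the now-inlined wrappers; handle_the_exception() is an illustrative placeholder, and the exact arrangement is an assumption rather than a copy of any arch handler:

void arch_user_exception_sketch(struct pt_regs *regs)
{
	irqentry_enter_from_user_mode(regs);	/* expands to enter_from_user_mode(regs) */
	instrumentation_begin();
	handle_the_exception(regs);
	instrumentation_end();
	irqentry_exit_to_user_mode(regs);	/* inlined exit work plus exit_to_user_mode() */
}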
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.715309918@linutronix.de --- include/linux/irq-entry-common.h | 13 +++++++++++-- kernel/entry/common.c | 13 ------------- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 9b1f386ffeb1..83c9d841d9e1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -278,7 +278,10 @@ static __always_inline void exit_to_user_mode(void) * * The function establishes state (lockdep, RCU (context tracking), tracing) */ -void irqentry_enter_from_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} /** * irqentry_exit_to_user_mode - Interrupt exit work @@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); * Interrupt exit is not invoking #1 which is the syscall specific one time * work. */ -void irqentry_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} #ifndef irqentry_state /** diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f62e1d1b2063..70a16db4cc0a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -62,19 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) -{ - enter_from_user_mode(regs); -} - -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); -} - noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { -- cgit v1.2.3 From 4fc9225d19ad6289c03340a520d35e3a6d1aebed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:42 +0100 Subject: sched: Move MM CID related functions to sched.h There is nothing mm specific in that and including mm.h can cause header recursion hell. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.778457951@linutronix.de --- include/linux/mm.h | 25 ------------------------- include/linux/sched.h | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..17cfbba9914c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2401,31 +2401,6 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. 
Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -#ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_before_execve(struct task_struct *t); -void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) -{ - return t->mm_cid; -} -#else -static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) -{ - /* - * Use the processor id as a fall-back when the mm cid feature is - * disabled. This provides functional per-cpu data structure accesses - * in user-space, althrough it won't provide the memory usage benefits. - */ - return raw_smp_processor_id(); -} -#endif - #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 15627769409d..24a9da7ca3e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2310,6 +2310,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); +} +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS -- cgit v1.2.3 From 4b7de6df20d43dd651031aef8d818fa5da981dbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:45 +0100 Subject: rseq: Cache CPU ID and MM CID values In preparation for rewriting RSEQ exit to user space handling provide storage to cache the CPU ID and MM CID values which were written to user space. That prepares for a quick check, which avoids the update when nothing changed. 
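A sketch of the quick check this cache is meant to enable. The helper below is an assumption about the later use, not part of this patch; it relies on the cpu_cid union member introduced in the diff:

static bool rseq_ids_changed_sketch(struct task_struct *t)
{
	struct rseq_ids cur = {
		.cpu_id = task_cpu(t),
		.mm_cid = task_mm_cid(t),
	};

	/* cpu_cid overlays both IDs, so a single 64-bit compare covers them */
	return cur.cpu_cid != t->rseq.ids.cpu_cid;
}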
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.841964081@linutronix.de --- include/linux/rseq.h | 7 +++++-- include/linux/rseq_types.h | 21 +++++++++++++++++++++ include/trace/events/rseq.h | 4 ++-- kernel/rseq.c | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ab91b1e6bb4a..d315a92afb36 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); + t->rseq.ids.cpu_cid = ~0ULL; } static inline void rseq_execve(struct task_struct *t) @@ -70,10 +71,12 @@ static inline void rseq_execve(struct task_struct *t) */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) + if (clone_flags & CLONE_VM) { rseq_reset(t); - else + } else { t->rseq = current->rseq; + t->rseq.ids.cpu_cid = ~0ULL; + } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index f7a60c8eddc9..40901b033b92 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -30,18 +30,39 @@ struct rseq_event { }; }; +/** + * struct rseq_ids - Cache for ids, which need to be updated + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the + * compiler emit a single compare on 64-bit + * @cpu_id: The CPU ID which was written last to user space + * @mm_cid: The MM CID which was written last to user space + * + * @cpu_id and @mm_cid are updated when the data is written to user space. + */ +struct rseq_ids { + union { + u64 cpu_cid; + struct { + u32 cpu_id; + u32 mm_cid; + }; + }; +}; + /** * struct rseq_data - Storage for all rseq related data * @usrptr: Pointer to the registered user space RSEQ memory * @len: Length of the RSEQ region * @sig: Signature of critial section abort IPs * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID */ struct rseq_data { struct rseq __user *usrptr; u32 len; u32 sig; struct rseq_event event; + struct rseq_ids ids; }; #else /* CONFIG_RSEQ */ diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h index 823b47d1ba1e..ce85d650bf4b 100644 --- a/include/trace/events/rseq.h +++ b/include/trace/events/rseq.h @@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update, ), TP_fast_assign( - __entry->cpu_id = raw_smp_processor_id(); + __entry->cpu_id = t->rseq.ids.cpu_id; __entry->node_id = cpu_to_node(__entry->cpu_id); - __entry->mm_cid = task_mm_cid(t); + __entry->mm_cid = t->rseq.ids.mm_cid; ), TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, diff --git a/kernel/rseq.c b/kernel/rseq.c index aae62661e6bb..ad1e7cecd527 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -184,6 +184,10 @@ static int rseq_update_cpu_node_id(struct task_struct *t) rseq_unsafe_put_user(t, node_id, node_id, efault_end); rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* Cache the user space values */ + t->rseq.ids.cpu_id = cpu_id; + t->rseq.ids.mm_cid = mm_cid; + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if -- cgit v1.2.3 From 2fc0e4b4126caadfa5772ba69276b350609584dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:48 +0100 Subject: rseq: Record interrupt from user space For RSEQ the only relevant reason to inspect and 
eventually fixup (abort) user space critical sections is when user space was interrupted and the task was scheduled out. If the user to kernel entry was from a syscall no fixup is required. If user space invokes a syscall from a critical section it can keep the pieces as documented. This is only supported on architectures which utilize the generic entry code. If your architecture does not use it, bad luck. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de --- include/linux/irq-entry-common.h | 3 ++- include/linux/rseq.h | 16 +++++++++++----- include/linux/rseq_entry.h | 18 ++++++++++++++++++ include/linux/rseq_types.h | 2 ++ 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 include/linux/rseq_entry.h (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 83c9d841d9e1..cb31fb84d7b4 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include @@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void) static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); } /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d315a92afb36..a200836a6fe3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq.event.has_rseq && - current->rseq.event.events)) - current->rseq.event.events = 0; - } + struct rseq_event *ev = ¤t->rseq.event; + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. 
+ */ + ev->events = 0; } /* diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h new file mode 100644 index 000000000000..ce30e87ce1f5 --- /dev/null +++ b/include/linux/rseq_entry.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_ENTRY_H +#define _LINUX_RSEQ_ENTRY_H + +#ifdef CONFIG_RSEQ +#include + +static __always_inline void rseq_note_user_irq_entry(void) +{ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) + current->rseq.event.user_irq = true; +} + +#else /* CONFIG_RSEQ */ +static inline void rseq_note_user_irq_entry(void) { } +#endif /* !CONFIG_RSEQ */ + +#endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 40901b033b92..80f6c398ef0f 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -12,6 +12,7 @@ struct rseq; * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store * @sched_switch: True if the task was scheduled out + * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed */ struct rseq_event { @@ -22,6 +23,7 @@ struct rseq_event { u16 events; struct { u8 sched_switch; + u8 user_irq; }; }; -- cgit v1.2.3 From dab344753e021fe84c24f9d8b0b63cb5bcf463d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:50 +0100 Subject: rseq: Provide tracepoint wrappers for inline code Provide tracepoint wrappers for the upcoming RSEQ exit to user space inline fast path, so that the header can be safely included by code which defines actual trace points. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.967114316@linutronix.de --- include/linux/rseq_entry.h | 28 ++++++++++++++++++++++++++++ kernel/rseq.c | 19 ++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ce30e87ce1f5..5be507a127eb 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -5,6 +5,34 @@ #ifdef CONFIG_RSEQ #include +#include + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(rseq_update); +DECLARE_TRACEPOINT(rseq_ip_fixup); +void __rseq_trace_update(struct task_struct *t); +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip); + +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) +{ + if (tracepoint_enabled(rseq_update) && ids) + __rseq_trace_update(t); +} + +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + if (tracepoint_enabled(rseq_ip_fixup)) + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); +} + +#else /* CONFIG_TRACEPOINT */ +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) { } +#endif /* !CONFIG_TRACEPOINT */ + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) diff --git a/kernel/rseq.c b/kernel/rseq.c index ad1e7cecd527..f49d3118c3e0 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include #include @@ -91,6 +91,23 @@ 
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_TRACEPOINTS +/* + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. + */ +void __rseq_trace_update(struct task_struct *t) +{ + trace_rseq_update(t); +} + +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); +} +#endif /* CONFIG_TRACEPOINTS */ + #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) { -- cgit v1.2.3 From 5412910487d0839111e4f2f3a6f33f6c9af9b007 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:52 +0100 Subject: rseq: Expose lightweight statistics in debugfs Analyzing the call frequency without actually using tracing is helpful for analysis of this infrastructure. The overhead is minimal as it just increments a per CPU counter associated to each operation. The debugfs readout provides a racy sum of all counters. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de --- include/linux/rseq.h | 16 ---------- include/linux/rseq_entry.h | 49 ++++++++++++++++++++++++++++ init/Kconfig | 12 +++++++ kernel/rseq.c | 79 ++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 133 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index a200836a6fe3..7f347c3a4af8 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) } } -static __always_inline void rseq_exit_to_user_mode(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in @@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } -static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5be507a127eb..ff9080b89be3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -2,6 +2,37 @@ #ifndef _LINUX_RSEQ_ENTRY_H #define _LINUX_RSEQ_ENTRY_H +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ +#ifdef CONFIG_RSEQ_STATS +#include + +struct rseq_stats { + unsigned long exit; + unsigned long signal; + unsigned long slowpath; + unsigned long ids; + unsigned long cs; + unsigned long clear; + unsigned long fixup; +}; + +DECLARE_PER_CPU(struct rseq_stats, rseq_stats); + +/* + * Slow path has interrupts and preemption enabled, but the fast path + * runs with interrupts disabled so there is no point in having the + * preemption checks implied in __this_cpu_inc() for every operation. 
+ */ +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_stat_inc(which) this_cpu_inc((which)) +#else +#define rseq_stat_inc(which) raw_cpu_inc((which)) +#endif + +#else /* CONFIG_RSEQ_STATS */ +#define rseq_stat_inc(x) do { } while (0) +#endif /* !CONFIG_RSEQ_STATS */ + #ifdef CONFIG_RSEQ #include @@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void) current->rseq.event.user_irq = true; } +static __always_inline void rseq_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. + */ + ev->events = 0; +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..f39fdfb28797 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1913,6 +1913,18 @@ config RSEQ If unsure, say Y. +config RSEQ_STATS + default n + bool "Enable lightweight statistics of restartable sequences" if EXPERT + depends on RSEQ && DEBUG_FS + help + Enable lightweight counters which expose information about the + frequency of RSEQ operations via debugfs. Mostly interesting for + kernel debugging or performance analysis. While lightweight it's + still adding code into the user/kernel mode transitions. + + If unsure, say N. + config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT diff --git a/kernel/rseq.c b/kernel/rseq.c index f49d3118c3e0..c0dbe2ed26a8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -67,12 +67,16 @@ * F1. 
*/ +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH + +#include +#include +#include #include -#include #include -#include +#include #include -#include #include #define CREATE_TRACE_POINTS @@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_debug_show(struct seq_file *m, void *p) +{ + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + } + + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} + +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = rseq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); + + debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops); + return 0; +} +__initcall(rseq_debugfs_init); +#endif /* CONFIG_RSEQ_STATS */ + #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) { @@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); - /* - * Validate read-only rseq fields. - */ + rseq_stat_inc(rseq_stats.ids); + + /* Validate read-only rseq fields on debug kernels */ if (rseq_validate_ro_fields(t)) goto efault; WARN_ON_ONCE((int) mm_cid < 0); + if (!user_write_access_begin(rseq, t->rseq.len)) goto efault; @@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) struct rseq_cs rseq_cs; int ret; + rseq_stat_inc(rseq_stats.cs); + ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; @@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. 
*/ - if (!in_rseq_cs(ip, &rseq_cs)) + if (!in_rseq_cs(ip, &rseq_cs)) { + rseq_stat_inc(rseq_stats.clear); return clear_rseq_cs(t->rseq.usrptr); + } ret = rseq_check_flags(t, rseq_cs.flags); if (ret < 0) return ret; @@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) ret = clear_rseq_cs(t->rseq.usrptr); if (ret) return ret; + rseq_stat_inc(rseq_stats.fixup); trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); @@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; + if (ksig) + rseq_stat_inc(rseq_stats.signal); + else + rseq_stat_inc(rseq_stats.slowpath); + /* * Read and clear the event pending bit first. If the task * was not preempted or migrated or a signal is on the way, -- cgit v1.2.3 From 9c37cb6e80b8fcdddc1236ba42ffd438f511192b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:55 +0100 Subject: rseq: Provide static branch for runtime debugging Config based debug is rarely turned on and is not available easily when things go wrong. Provide a static branch to allow permanent integration of debug mechanisms along with the usual toggles in Kconfig, command line and debugfs. Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.089270547@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 4 ++ include/linux/rseq_entry.h | 3 + init/Kconfig | 14 +++++ kernel/rseq.c | 73 +++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..e63827475792 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6500,6 +6500,10 @@ Memory area to be used by remote processor image, managed by CMA. + rseq_debug= [KNL] Enable or disable restartable sequence + debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE. + Format: + rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling when CONFIG_RT_GROUP_SCHED=y. Defaults to !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED. diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ff9080b89be3..ed8e5f89499b 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -34,6 +34,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include @@ -64,6 +65,8 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, unsigned long offset, unsigned long abort_ip) { } #endif /* !CONFIG_TRACEPOINT */ +DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) diff --git a/init/Kconfig b/init/Kconfig index f39fdfb28797..bde40ab664e2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1925,10 +1925,24 @@ config RSEQ_STATS If unsure, say N. +config RSEQ_DEBUG_DEFAULT_ENABLE + default n + bool "Enable restartable sequences debug mode by default" if EXPERT + depends on RSEQ + help + This enables the static branch for debug mode of restartable + sequences. 
+ + This also can be controlled on the kernel command line via the + command line parameter "rseq_debug=0/1" and through debugfs. + + If unsure, say N. + config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT depends on RSEQ && DEBUG_KERNEL + select RSEQ_DEBUG_DEFAULT_ENABLE help Enable extra debugging checks for the rseq system call. diff --git a/kernel/rseq.c b/kernel/rseq.c index c0dbe2ed26a8..679ab8ebdfd3 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -95,6 +95,27 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); +} + +static int __init rseq_setup_debug(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; +} +__setup("rseq_debug=", rseq_setup_debug); + #ifdef CONFIG_TRACEPOINTS /* * Out of line, so the actual update functions can be in a header to be @@ -112,10 +133,11 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ +#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); -static int rseq_debug_show(struct seq_file *m, void *p) +static int rseq_stats_show(struct seq_file *m, void *p) { struct rseq_stats stats = { }; unsigned int cpu; @@ -140,14 +162,56 @@ static int rseq_debug_show(struct seq_file *m, void *p) return 0; } +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} + +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); + return 0; +} +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ + +static int rseq_debug_show(struct seq_file *m, void *p) +{ + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; +} + +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + bool on; + + if (kstrtobool_from_user(ubuf, count, &on)) + return -EINVAL; + + rseq_control_debug(on); + return count; +} + static int rseq_debug_open(struct inode *inode, struct file *file) { return single_open(file, rseq_debug_show, inode->i_private); } -static const struct file_operations dfs_ops = { +static const struct file_operations debug_ops = { .open = rseq_debug_open, .read = seq_read, + .write = rseq_debug_write, .llseek = seq_lseek, .release = single_release, }; @@ -156,11 +220,12 @@ static int __init rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops); + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_RSEQ_STATS */ +#endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) -- cgit v1.2.3 From abc850e7616c91ebaa3f5ba3617ab0a104d45039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:57 +0100 Subject: rseq: Provide and use 
rseq_update_user_cs() Provide a straight forward implementation to check for and eventually clear/fixup critical sections in user space. The non-debug version does only the minimal sanity checks and aims for efficiency. There are two attack vectors, which are checked for: 1) An abort IP which is in the kernel address space. That would cause at least x86 to return to kernel space via IRET. 2) A rogue critical section descriptor with an abort IP pointing to some arbitrary address, which is not preceded by the RSEQ signature. If the section descriptors are invalid then the resulting misbehaviour of the user space application is not the kernels problem. The kernel provides a run-time switchable debug slow path, which implements the full zoo of checks including termination of the task when one of the gazillion conditions is not met. Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will be replaced and removed in a subsequent step. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de --- include/linux/rseq_entry.h | 206 +++++++++++++++++++++++++++++++++++++ include/linux/rseq_types.h | 11 +- kernel/rseq.c | 246 ++++++++++++++------------------------------- 3 files changed, 291 insertions(+), 172 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ed8e5f89499b..f9510ce72211 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #ifdef CONFIG_RSEQ #include #include +#include #include @@ -67,12 +68,217 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_inline +#else +#define rseq_inline __always_inline +#endif + +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) current->rseq.event.user_irq = true; } +/* + * Check whether there is a valid critical section and whether the + * instruction pointer in @regs is inside the critical section. + * + * - If the critical section is invalid, terminate the task. + * + * - If valid and the instruction pointer is inside, set it to the abort IP. + * + * - If valid and the instruction pointer is outside, clear the critical + * section address. + * + * Returns true, if the section was valid and either fixup or clear was + * done, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when a invalid + * section was found. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ + +#ifdef RSEQ_BUILD_SLOW_PATH +/* + * The debug version is put out of line, but kept here so the code stays + * together. 
+ * + * @csaddr has already been checked by the caller to be in user space + */ +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, + unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; + unsigned long ip = instruction_pointer(regs); + u64 __user *uc_head = (u64 __user *) ucs; + u32 usig, __user *uc_sig; + + scoped_user_rw_access(ucs, efault) { + /* + * Evaluate the user pile and exit if one of the conditions + * is not fulfilled. + */ + unsafe_get_user(start_ip, &ucs->start_ip, efault); + if (unlikely(start_ip >= tasksize)) + goto die; + /* If outside, just clear the critical section. */ + if (ip < start_ip) + goto clear; + + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + cs_end = start_ip + offset; + /* Check for overflow and wraparound */ + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) + goto die; + + /* If not inside, clear it. */ + if (ip >= cs_end) + goto clear; + + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + /* Ensure it's "valid" */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + /* Validate that the abort IP is not in the critical section */ + if (unlikely(abort_ip - start_ip < offset)) + goto die; + + /* + * Check version and flags for 0. No point in emitting + * deprecated warnings before dying. That could be done in + * the slow path eventually, but *shrug*. + */ + unsafe_get_user(head, uc_head, efault); + if (unlikely(head)) + goto die; + + /* abort_ip - 4 is >= 0. See abort_ip check above */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* If not in interrupt from user context, let it die */ + if (unlikely(!t->rseq.event.user_irq)) + goto die; + } + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +#endif /* RSEQ_BUILD_SLOW_PATH */ + +/* + * This only ensures that abort_ip is in the user address space and + * validates that it is preceded by the signature. + * + * No other sanity checks are done here, that's what the debug code is for. + */ +static rseq_inline bool +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + unsigned long ip = instruction_pointer(regs); + u64 start_ip, abort_ip, offset; + u32 usig, __user *uc_sig; + + rseq_stat_inc(rseq_stats.cs); + + if (unlikely(csaddr >= TASK_SIZE)) { + t->rseq.event.fatal = true; + return false; + } + + if (static_branch_unlikely(&rseq_debug_enabled)) + return rseq_debug_update_user_cs(t, regs, csaddr); + + scoped_user_rw_access(ucs, efault) { + unsafe_get_user(start_ip, &ucs->start_ip, efault); + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + + /* + * No sanity checks. 
If user space screwed it up, it can + * keep the pieces. That's what debug code is for. + * + * If outside, just clear the critical section. + */ + if (ip - start_ip >= offset) + goto clear; + + /* + * Two requirements for @abort_ip: + * - Must be in user space as x86 IRET would happily return to + * the kernel. + * - The four bytes preceding the instruction at @abort_ip must + * contain the signature. + * + * The latter protects against the following attack vector: + * + * An attacker with limited abilities to write, creates a critical + * section descriptor, sets the abort IP to a library function or + * some other ROP gadget and stores the address of the descriptor + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP + * protection. + */ + if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + goto die; + + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* Invalidate the critical section */ + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + /* Update the instruction pointer */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 80f6c398ef0f..7c123947bb98 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -14,10 +14,12 @@ struct rseq; * @sched_switch: True if the task was scheduled out * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed + * @error: Compound error code for the slow path to analyze + * @fatal: User space data corrupted or invalid */ struct rseq_event { union { - u32 all; + u64 all; struct { union { u16 events; @@ -28,6 +30,13 @@ struct rseq_event { }; u8 has_rseq; + u8 __pad; + union { + u16 error; + struct { + u8 fatal; + }; + }; }; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 679ab8ebdfd3..12a9b6ab2cef 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -382,175 +382,18 @@ efault: return -EFAULT; } -/* - * Get the user-space pointer value stored in the 'rseq_cs' field. - */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) -{ - if (!rseq_cs) - return -EFAULT; - -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif - - return 0; -} - -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) -{ - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. 
*/ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; - } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; - - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; - - if (current->rseq.sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq.sig, current->pid, usig); - return -EINVAL; - } - return 0; -} - -static bool rseq_warn_flags(const char *str, u32 flags) -{ - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; -} - -static int rseq_check_flags(struct task_struct *t, u32 cs_flags) -{ - u32 flags; - int ret; - - if (rseq_warn_flags("rseq_cs", cs_flags)) - return -EINVAL; - - /* Get thread flags. */ - ret = get_user(flags, &t->rseq.usrptr->flags); - if (ret) - return ret; - - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; - return 0; -} - -static int clear_rseq_cs(struct rseq __user *rseq) -{ - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif -} - -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) -{ - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; -} - -static int rseq_ip_fixup(struct pt_regs *regs, bool abort) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); - struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; - - rseq_stat_inc(rseq_stats.cs); - - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; - - /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. 
- */ - if (!in_rseq_cs(ip, &rseq_cs)) { - rseq_stat_inc(rseq_stats.clear); - return clear_rseq_cs(t->rseq.usrptr); - } - ret = rseq_check_flags(t, rseq_cs.flags); - if (ret < 0) - return ret; - if (!abort) - return 0; - ret = clear_rseq_cs(t->rseq.usrptr); - if (ret) - return ret; - rseq_stat_inc(rseq_stats.fixup); - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } /* @@ -567,8 +410,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort) void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; - int ret, sig; bool event; + int sig; /* * If invoked from hypervisors before entering the guest via @@ -618,8 +461,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) return; - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) + if (!rseq_handle_cs(t, regs)) goto error; if (unlikely(rseq_update_cpu_node_id(t))) @@ -632,6 +474,68 @@ error: } #ifdef CONFIG_DEBUG_RSEQ +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq __user *urseq = t->rseq.usrptr; + struct rseq_cs __user *urseq_cs; + u32 __user *usig; + u64 ptr; + u32 sig; + int ret; + + if (get_user(ptr, &rseq->rseq_cs)) + return -EFAULT; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + /* Check that the pointer value fits in the user-space process space. */ + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq.sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq.sig, current->pid, usig); + return -EINVAL; + } + return 0; +} /* * Terminate the process if a syscall is issued within a restartable -- cgit v1.2.3 From c1cbad8f99b5c73c6af6e96acbfa64eaaaeb085f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:02 +0100 Subject: rseq: Make exit debugging static branch based Disconnect it from the config switch and use the static debug branch. This is a temporary measure for validating the rework. At the end this check needs to be hidden behind lockdep as it has nothing to do with the other debug infrastructure, which mainly aids user space debugging by enabling a zoo of checks which terminate misbehaving tasks instead of letting them keep the hard to diagnose pieces. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.272660745@linutronix.de --- include/linux/rseq_entry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f9510ce72211..5bdcf5b5f595 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -285,7 +285,7 @@ static __always_inline void rseq_exit_to_user_mode(void) rseq_stat_inc(rseq_stats.exit); - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + if (static_branch_unlikely(&rseq_debug_enabled)) WARN_ON_ONCE(ev->sched_switch); /* -- cgit v1.2.3 From eaa9088d568c84afd72fa32dbe01833aef861d0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:05 +0100 Subject: rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y Make the syscall exit debug mechanism available via the static branch on architectures which utilize the generic entry code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.333440475@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/rseq_entry.h | 9 +++++++++ kernel/rseq.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 75b194c34e18..d967184ae08f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -146,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) local_irq_enable(); } - rseq_syscall(regs); + rseq_debug_syscall_return(regs); /* * Do one-time syscall specific work. 
If these work items are diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5bdcf5b5f595..fb53a6ff05d7 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -296,9 +296,18 @@ static __always_inline void rseq_exit_to_user_mode(void) ev->events = 0; } +void __rseq_debug_syscall_return(struct pt_regs *regs); + +static inline void rseq_debug_syscall_return(struct pt_regs *regs) +{ + if (static_branch_unlikely(&rseq_debug_enabled)) + __rseq_debug_syscall_return(regs); +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } static inline void rseq_exit_to_user_mode(void) { } +static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index abd6bfadbcc6..97631554ae96 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -473,12 +473,11 @@ error: force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { struct task_struct *t = current; u64 csaddr; @@ -496,6 +495,13 @@ void rseq_syscall(struct pt_regs *regs) fail: force_sig(SIGSEGV); } + +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif /* -- cgit v1.2.3 From 0f085b41880e3140efa6941ff2b8fd43bac4d659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:08 +0100 Subject: rseq: Provide and use rseq_set_ids() Provide a new and straight forward implementation to set the IDs (CPU ID, Node ID and MM CID), which can be later inlined into the fast path. It does all operations in one scoped_user_rw_access() section and retrieves also the critical section member (rseq::cs_rseq) from user space to avoid another user..begin/end() pair. This is in preparation for optimizing the fast path to avoid extra work when not required. On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and node and MM CID to zero. That's the same as the kernel internal reset values. That makes the debug validation in the exit code work correctly on the first exit to user space. 
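As a rough stand-alone model of that combined pass (the names rseq_model and set_ids_get_csaddr are invented for this sketch; the kernel helper does the equivalent under a single scoped_user_rw_access() section with unsafe_put_user()/unsafe_get_user(), see the include/linux/rseq_entry.h hunk below):

  #include <stdint.h>
  #include <stdio.h>

  /* Userspace model of the rseq fields touched in one access section */
  struct rseq_model {
          uint32_t cpu_id_start;
          uint32_t cpu_id;
          uint32_t node_id;
          uint32_t mm_cid;
          uint64_t rseq_cs;
  };

  /*
   * Publish all IDs and, in the same pass, read back the critical
   * section pointer so the caller does not need a second access section.
   */
  static uint64_t set_ids_get_csaddr(struct rseq_model *rs, uint32_t cpu,
                                     uint32_t node, uint32_t mm_cid)
  {
          rs->cpu_id_start = cpu;
          rs->cpu_id = cpu;
          rs->node_id = node;
          rs->mm_cid = mm_cid;
          return rs->rseq_cs;
  }

  int main(void)
  {
          struct rseq_model rs = { .rseq_cs = 0 };

          printf("csaddr: 0x%llx\n",
                 (unsigned long long)set_ids_get_csaddr(&rs, 3, 0, 1));
          return 0;
  }

The actual helper is rseq_set_ids_get_csaddr() in the include/linux/rseq_entry.h hunk below, with rseq_set_ids() as a thin wrapper passing a NULL csaddr.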
Use it to replace the whole related zoo in rseq.c Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de --- fs/binfmt_elf.c | 2 +- include/linux/rseq.h | 16 ++- include/linux/rseq_entry.h | 89 +++++++++++++++++ include/linux/sched.h | 10 -- kernel/rseq.c | 236 ++++++++++----------------------------------- 5 files changed, 151 insertions(+), 202 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e4653bb99946..3eb734c192e9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7f347c3a4af8..92f9cd49489b 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -5,6 +5,8 @@ #ifdef CONFIG_RSEQ #include +#include + void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -48,7 +50,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); - t->rseq.ids.cpu_cid = ~0ULL; + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } static inline void rseq_execve(struct task_struct *t) @@ -59,15 +61,19 @@ static inline void rseq_execve(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. + * + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault + * on the COW page on exit to user space, when the child stays on the same + * CPU as the parent. That's obviously not guaranteed, but in overcommit + * scenarios it is more likely and optimizes for the fork/exec case without + * taking the fault. */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { + if (clone_flags & CLONE_VM) rseq_reset(t); - } else { + else t->rseq = current->rseq; - t->rseq.ids.cpu_cid = ~0ULL; - } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index fb53a6ff05d7..37444e80fd45 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -75,6 +75,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); #endif bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); +bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -194,6 +195,43 @@ efault: return false; } +/* + * On debug kernels validate that user space did not mess with it if the + * debug branch is enabled. + */ +bool rseq_debug_validate_ids(struct task_struct *t) +{ + struct rseq __user *rseq = t->rseq.usrptr; + u32 cpu_id, uval, node_id; + + /* + * On the first exit after registering the rseq region CPU ID is + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! + */ + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 
+ cpu_to_node(t->rseq.ids.cpu_id) : 0; + + scoped_user_read_access(rseq, efault) { + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -279,6 +317,57 @@ efault: return false; } +/* + * Updates CPU ID, Node ID and MM CID and reads the critical section + * address, when @csaddr != NULL. This allows to put the ID update and the + * read under the same uaccess region to spare a separate begin/end. + * + * As this is either invoked from a C wrapper with @csaddr = NULL or from + * the fast path code with a valid pointer, a clever compiler should be + * able to optimize the read out. Spares a duplicate implementation. + * + * Returns true, if the operation was successful, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when invalid data + * was found on debug kernels. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ +static rseq_inline +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, + u32 node_id, u64 *csaddr) +{ + struct rseq __user *rseq = t->rseq.usrptr; + + if (static_branch_unlikely(&rseq_debug_enabled)) { + if (!rseq_debug_validate_ids(t)) + return false; + } + + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); + unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); + if (csaddr) + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); + } + + /* Cache the new values */ + t->rseq.ids.cpu_cid = ids->cpu_cid; + rseq_stat_inc(rseq_stats.ids); + rseq_trace_update(t, ids); + return true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/include/linux/sched.h b/include/linux/sched.h index 24a9da7ca3e7..e47abc8685d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -1408,15 +1407,6 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; -#ifdef CONFIG_DEBUG_RSEQ - /* - * This is a place holder to save a copy of the rseq fields for - * validation of read-only fields. The struct rseq has a - * variable-length array at the end, so it cannot be used - * directly. Reserve a size large enough for the known fields. - */ - char rseq_fields[sizeof(struct rseq)]; -#endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 97631554ae96..1e4f1d2cdfe5 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -88,13 +88,6 @@ # define RSEQ_EVENT_GUARD preempt #endif -/* The original rseq structure size (including padding) is 32 bytes. 
*/ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); static inline void rseq_control_debug(bool on) @@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void) __initcall(rseq_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) -{ - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq.usrptr; - - /* - * Validate fields which are required to be read-only by - * user-space. - */ - if (!user_read_access_begin(rseq, t->rseq.len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq.usrptr->field, error_label) -#endif - -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); - - rseq_stat_inc(rseq_stats.ids); - - /* Validate read-only rseq fields on debug kernels */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - - if (!user_write_access_begin(rseq, t->rseq.len)) - goto efault; - - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* Cache the user space values */ - t->rseq.ids.cpu_id = cpu_id; - t->rseq.ids.mm_cid = mm_cid; - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. 
- */ - user_write_access_end(); - trace_rseq_update(t); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; -} - -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - - if (!user_write_access_begin(rseq, t->rseq.len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) @@ -410,6 +253,8 @@ efault: void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; + struct rseq_ids ids; + u32 node_id; bool event; int sig; @@ -456,6 +301,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) scoped_guard(RSEQ_EVENT_GUARD) { event = t->rseq.event.sched_switch; t->rseq.event.sched_switch = false; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); } if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) @@ -464,7 +311,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (!rseq_handle_cs(t, regs)) goto error; - if (unlikely(rseq_update_cpu_node_id(t))) + node_id = cpu_to_node(ids.cpu_id); + if (!rseq_set_ids(t, &ids, node_id)) goto error; return; @@ -504,13 +352,33 @@ void rseq_syscall(struct pt_regs *regs) } #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. 
*/ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -521,9 +389,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EINVAL; if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; + if (!rseq_reset_ids()) + return -EFAULT; rseq_reset(current); return 0; } @@ -563,27 +430,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. Don't bother reading it, just reset it. - */ - if (put_user(0UL, &rseq->rseq_cs)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. @@ -599,6 +461,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 */ current->rseq.event.has_rseq = true; rseq_sched_switch_event(current); - return 0; + +efault: + return -EFAULT; } -- cgit v1.2.3 From 9f6ffd4cebda86841700775de3213f22bb0ea22d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:10 +0100 Subject: rseq: Separate the signal delivery path Completely separate the signal delivery path from the notify handler as they have different semantics versus the event handling. The signal delivery only needs to ensure that the interrupted user context was not in a critical section or the section is aborted before it switches to the signal frame context. The signal frame context does not have the original instruction pointer anymore, so that can't be handled on exit to user space. No point in updating the CPU/CID ids as they might change again before the task returns to user space for real. The fast path optimization, which checks for the 'entry from user via interrupt' condition is only available for architectures which use the generic entry code. 
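For illustration, the fixup that has to happen before the register state is replaced by the signal frame can be modelled stand-alone as below. The names rseq_cs_model and fixup_ip_for_signal are invented for this sketch; the real work is done against the interrupted pt_regs, and the IDs are deliberately left untouched:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Userspace model of a critical section descriptor (cf. struct rseq_cs) */
  struct rseq_cs_model {
          uint64_t start_ip;
          uint64_t post_commit_offset;
          uint64_t abort_ip;
  };

  /*
   * If the interrupted instruction pointer lies inside
   * [start_ip, start_ip + post_commit_offset), redirect it to abort_ip
   * before the signal frame takes over. Returns true when an abort
   * happened.
   */
  static bool fixup_ip_for_signal(uint64_t *ip, const struct rseq_cs_model *cs)
  {
          /* One unsigned compare covers both bounds */
          if (*ip - cs->start_ip < cs->post_commit_offset) {
                  *ip = cs->abort_ip;
                  return true;
          }
          return false;
  }

  int main(void)
  {
          struct rseq_cs_model cs = {
                  .start_ip = 0x1000, .post_commit_offset = 0x20, .abort_ip = 0x2000,
          };
          uint64_t ip = 0x1010;

          printf("aborted: %d, new ip: 0x%llx\n",
                 fixup_ip_for_signal(&ip, &cs), (unsigned long long)ip);
          return 0;
  }

On the kernel side this is rseq_handle_cs() invoked from the new __rseq_signal_deliver() in the kernel/rseq.c hunk below, which deliberately skips the ID update.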
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.455429038@linutronix.de --- include/linux/rseq.h | 21 ++++++++++++++++----- kernel/rseq.c | 30 ++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 92f9cd49489b..f5a43188023f 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,22 +7,33 @@ #include -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq.event.has_rseq) - __rseq_handle_notify_resume(NULL, regs); + __rseq_handle_notify_resume(regs); } +void __rseq_signal_deliver(int sig, struct pt_regs *regs); + +/* + * Invoked from signal delivery to fixup based on the register context before + * switching to the signal delivery context. + */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq.event.has_rseq) { - current->rseq.event.sched_switch = true; - __rseq_handle_notify_resume(ksig, regs); + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + __rseq_signal_deliver(ksig->sig, regs); + } else { + if (current->rseq.event.has_rseq) + __rseq_signal_deliver(ksig->sig, regs); } } +/* Raised from context switch and exevce to force evaluation on exit to user */ static inline void rseq_sched_switch_event(struct task_struct *t) { if (t->rseq.event.has_rseq) { diff --git a/kernel/rseq.c b/kernel/rseq.c index 1e4f1d2cdfe5..13faadc737ad 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -250,13 +250,12 @@ efault: * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_notify_resume(struct pt_regs *regs) { struct task_struct *t = current; struct rseq_ids ids; u32 node_id; bool event; - int sig; /* * If invoked from hypervisors before entering the guest via @@ -275,10 +274,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - if (ksig) - rseq_stat_inc(rseq_stats.signal); - else - rseq_stat_inc(rseq_stats.slowpath); + rseq_stat_inc(rseq_stats.slowpath); /* * Read and clear the event pending bit first. If the task @@ -317,8 +313,26 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) return; error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); + force_sig(SIGSEGV); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); + /* + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. + */ + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. 
+ */ + current->rseq.event.error = 0; + force_sigsegv(sig); + } } /* -- cgit v1.2.3 From e2d4f42271155045a49b89530f2c06ad8e9f1a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:12 +0100 Subject: rseq: Rework the TIF_NOTIFY handler Replace the whole logic with a new implementation, which is shared with signal delivery and the upcoming exit fast path. Contrary to the original implementation, this ignores invocations from KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument set to NULL. The original implementation updated the CPU/Node/MM CID fields, but that was just a side effect, which was addressing the problem that this invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update on return to user space to be lost. This problem has been addressed differently, so that it's not longer required to do that update before entering the guest. That might be considered a user visible change, when the hosts thread TLS memory is mapped into the guest, but as this was never intentionally supported, this abuse of kernel internal implementation details is not considered an ABI break. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de --- include/linux/rseq_entry.h | 29 ++++++++++++++++++ kernel/rseq.c | 76 ++++++++++++++++++++-------------------------- 2 files changed, 62 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 37444e80fd45..aa1c0464a16c 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -368,6 +368,35 @@ efault: return false; } +/* + * Update user space with new IDs and conditionally check whether the task + * is in a critical section. + */ +static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, + struct rseq_ids *ids, u32 node_id) +{ + u64 csaddr; + + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + return false; + + /* + * On architectures which utilize the generic entry code this + * allows to skip the critical section when the entry was not from + * a user space interrupt, unless debug mode is enabled. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + if (!static_branch_unlikely(&rseq_debug_enabled)) { + if (likely(!t->rseq.event.user_irq)) + return true; + } + } + if (likely(!csaddr)) + return true; + /* Sigh, this really needs to do work */ + return rseq_update_user_cs(t, regs, csaddr); +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; diff --git a/kernel/rseq.c b/kernel/rseq.c index 13faadc737ad..148fb2103023 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -82,12 +82,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); static inline void rseq_control_debug(bool on) @@ -239,38 +233,15 @@ efault: return false; } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. 
- */ -void __rseq_handle_notify_resume(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { + /* Preserve rseq state and user_irq state for exit to user */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; struct rseq_ids ids; u32 node_id; bool event; - /* - * If invoked from hypervisors before entering the guest via - * resume_user_mode_work(), then @regs is a NULL pointer. - * - * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises - * it before returning from the ioctl() to user space when - * rseq_event.sched_switch is set. - * - * So it's safe to ignore here instead of pointlessly updating it - * in the vcpu_run() loop. - */ - if (!regs) - return; - if (unlikely(t->flags & PF_EXITING)) return; @@ -294,26 +265,45 @@ void __rseq_handle_notify_resume(struct pt_regs *regs) * with the result handed in to allow the detection of * inconsistencies. */ - scoped_guard(RSEQ_EVENT_GUARD) { + scoped_guard(irq) { event = t->rseq.event.sched_switch; - t->rseq.event.sched_switch = false; + t->rseq.event.all &= evt_mask.all; ids.cpu_id = task_cpu(t); ids.mm_cid = task_mm_cid(t); } - if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) + if (!event) return; - if (!rseq_handle_cs(t, regs)) - goto error; - node_id = cpu_to_node(ids.cpu_id); - if (!rseq_set_ids(t, &ids, node_id)) - goto error; - return; -error: - force_sig(SIGSEGV); + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } +} + +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) + return; + + rseq_slowpath_update_usr(regs); } void __rseq_signal_deliver(int sig, struct pt_regs *regs) -- cgit v1.2.3 From 39a167560a61f913560ba803a96dbe6c15239f5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:14 +0100 Subject: rseq: Optimize event setting After removing the various condition bits earlier it turns out that one extra information is needed to avoid setting event::sched_switch and TIF_NOTIFY_RESUME unconditionally on every context switch. The update of the RSEQ user space memory is only required, when either the task was interrupted in user space and schedules or the CPU or MM CID changes in schedule() independent of the entry mode Right now only the interrupt from user information is available. Add an event flag, which is set when the CPU or MM CID or both change. Evaluate this event in the scheduler to decide whether the sched_switch event and the TIF bit need to be set. It's an extra conditional in context_switch(), but the downside of unconditionally handling RSEQ after a context switch to user is way more significant. The utilized boolean logic minimizes this to a single conditional branch. 
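A stand-alone model of that single-branch decision is sketched below; the names rseq_event_model and need_notify_resume are invented for the example, while the expression itself mirrors the rseq_sched_switch_event() hunk further down:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Minimal model of the event bits relevant to the decision */
  struct rseq_event_model {
          uint8_t sched_switch;
          uint8_t ids_changed;
          uint8_t user_irq;
          uint8_t has_rseq;
  };

  /* True when TIF_NOTIFY_RESUME needs to be raised on this context switch */
  static bool need_notify_resume(const struct rseq_event_model *ev)
  {
          /*
           * Bitwise '|' and '&' instead of '||'/'&&' keep the evaluation
           * branchless: an update is only needed when the task has rseq
           * registered AND (it was interrupted in user space OR its
           * CPU/MM CID changed).
           */
          return (ev->user_irq | ev->ids_changed) & ev->has_rseq;
  }

  int main(void)
  {
          struct rseq_event_model ev = { .ids_changed = 1, .has_rseq = 1 };

          printf("raise notify resume: %d\n", need_notify_resume(&ev));
          return 0;
  }
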
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de --- fs/exec.c | 2 +- include/linux/rseq.h | 81 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/rseq_types.h | 11 +++++-- kernel/rseq.c | 2 +- kernel/sched/core.c | 7 +++- kernel/sched/sched.h | 5 ++- 6 files changed, 95 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index e45b29890269..90e47eb156ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1775,7 +1775,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_sched_switch_event(current); + rseq_force_update(); current->in_execve = 0; return retval; diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f5a43188023f..abfbeb42d1a2 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -11,7 +11,8 @@ void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq.event.has_rseq) + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) __rseq_handle_notify_resume(regs); } @@ -33,12 +34,75 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg } } -/* Raised from context switch and exevce to force evaluation on exit to user */ -static inline void rseq_sched_switch_event(struct task_struct *t) +static inline void rseq_raise_notify_resume(struct task_struct *t) { - if (t->rseq.event.has_rseq) { - t->rseq.event.sched_switch = true; - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +/* Invoked from context switch to force evaluation on exit to user */ +static __always_inline void rseq_sched_switch_event(struct task_struct *t) +{ + struct rseq_event *ev = &t->rseq.event; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Avoid a boat load of conditionals by using simple logic + * to determine whether NOTIFY_RESUME needs to be raised. + * + * It's required when the CPU or MM CID has changed or + * the entry was from user space. + */ + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + + if (raise) { + ev->sched_switch = true; + rseq_raise_notify_resume(t); + } + } else { + if (ev->has_rseq) { + t->rseq.event.sched_switch = true; + rseq_raise_notify_resume(t); + } + } +} + +/* + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs + * update. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) +{ + t->rseq.event.ids_changed = true; +} + +/* + * Invoked from switch_mm_cid() in context switch when the task gets a MM + * CID assigned. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) +{ + /* + * Requires a comparison as the switch_mm_cid() code does not + * provide a conditional for it readily. So avoid excessive updates + * when nothing changes. 
+ */ + if (t->rseq.ids.mm_cid != cid) + t->rseq.event.ids_changed = true; +} + +/* Enforce a full update after RSEQ registration and when execve() failed */ +static inline void rseq_force_update(void) +{ + if (current->rseq.event.has_rseq) { + current->rseq.event.ids_changed = true; + current->rseq.event.sched_switch = true; + rseq_raise_notify_resume(current); } } @@ -55,7 +119,7 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + rseq_raise_notify_resume(current); } static inline void rseq_reset(struct task_struct *t) @@ -91,6 +155,9 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } +static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } +static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 7c123947bb98..a1389fff4fca 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -11,20 +11,27 @@ struct rseq; * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store - * @sched_switch: True if the task was scheduled out + * @sched_switch: True if the task was scheduled and needs update on + * exit to user + * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * + * @sched_switch and @ids_changed must be adjacent and the combo must be + * 16bit aligned to allow a single store, when both are set at the same + * time in the scheduler. */ struct rseq_event { union { u64 all; struct { union { - u16 events; + u32 events; struct { u8 sched_switch; + u8 ids_changed; u8 user_irq; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 148fb2103023..183dde756808 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -464,7 +464,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * are updated before returning to user-space. */ current->rseq.event.has_rseq = true; - rseq_sched_switch_event(current); + rseq_force_update(); return 0; efault: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b75e8e1eca4a..579a8e93578f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5118,7 +5118,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_sched_switch_event(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5316,6 +5315,12 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. 
*/ switch_mm_cid(rq, prev, next); + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..4838dda75b10 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2209,6 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; + rseq_sched_set_task_cpu(p, cpu); #endif /* CONFIG_SMP */ } @@ -3807,8 +3808,10 @@ static inline void switch_mm_cid(struct rq *rq, mm_cid_put_lazy(prev); prev->mm_cid = -1; } - if (next->mm_cid_active) + if (next->mm_cid_active) { next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + rseq_sched_set_task_mm_cid(next, next->mm_cid); + } } #else /* !CONFIG_SCHED_MM_CID: */ -- cgit v1.2.3 From 05b44aef709cae5e4274590f050cf35049dcc24e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:17 +0100 Subject: rseq: Implement fast path for exit to user Implement the actual logic for handling RSEQ updates in a fast path after handling the TIF work and at the point where the task is actually returning to user space. This is the right point to do that because at this point the CPU and the MM CID are stable and cannot longer change due to yet another reschedule. That happens when the task is handling it via TIF_NOTIFY_RESUME in resume_user_mode_work(), which is invoked from the exit to user mode work loop. The function is invoked after the TIF work is handled and runs with interrupts disabled, which means it cannot resolve page faults. It therefore disables page faults and in case the access to the user space memory faults, it: - notes the fail in the event struct - raises TIF_NOTIFY_RESUME - returns false to the caller The caller has to go back to the TIF work, which runs with interrupts enabled and therefore can resolve the page faults. This happens mostly on fork() when the memory is marked COW. If the user memory inspection finds invalid data, the function returns false as well and sets the fatal flag in the event struct along with TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag and terminate the task with SIGSEGV as documented. The initial decision to invoke any of this is based on one flags in the event struct: @sched_switch. The decision is in pseudo ASM: load tsk::event::sched_switch jnz inspect_user_space mov $0, tsk::event::events ... leave So for the common case where the task was not scheduled out, this really boils down to three instructions before going out if the compiler is not completely stupid (and yes, some of them are). If the condition is true, then it checks, whether CPU ID or MM CID have changed. If so, then the CPU/MM IDs have to be updated and are thereby cached for the next round. The update unconditionally retrieves the user space critical section address to spare another user*begin/end() pair. If that's not zero and tsk::event::user_irq is set, then the critical section is analyzed and acted upon. If either zero or the entry came via syscall the critical section analysis is skipped. If the comparison is false then the critical section has to be analyzed because the event flag is then only true when entry from user was by interrupt. 
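Restated in C rather than pseudo ASM (a schematic sketch only, not compilable
kernel code; update_ids_and_get_csaddr() and check_critical_section*() are
placeholder names for the real helpers):

	/* Fast path on exit to user mode */
	if (!t->rseq.event.sched_switch) {
		/* Common case: nothing to do, clear the events and leave */
		t->rseq.event.events = 0;
		return true;
	}
	if (!t->rseq.event.ids_changed) {
		/*
		 * The event can only have been set by an interrupt from user
		 * mode, so the critical section has to be analyzed.
		 */
		return check_critical_section(t, regs);
	}
	/* Write CPU/node/MM CID to the TLS and fetch rseq_cs in the same access */
	if (!update_ids_and_get_csaddr(t, &csaddr))
		return false;			/* fault: defer to the slow path */
	if (csaddr && t->rseq.event.user_irq)
		return check_critical_section_at(t, regs, csaddr);
	return true;				/* NULL descriptor or entry via syscall */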
This is provided without the actual hookup to let reviewers focus on the implementation details. The hookup happens in the next step. Note: As with quite some other optimizations this depends on the generic entry infrastructure and is not enabled to be sucked into random architecture implementations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de --- include/linux/rseq_entry.h | 133 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/rseq_types.h | 3 + kernel/rseq.c | 2 + 3 files changed, 135 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index aa1c0464a16c..3f13be7301fa 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -10,6 +10,7 @@ struct rseq_stats { unsigned long exit; unsigned long signal; unsigned long slowpath; + unsigned long fastpath; unsigned long ids; unsigned long cs; unsigned long clear; @@ -245,12 +246,13 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c { struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; unsigned long ip = instruction_pointer(regs); + unsigned long tasksize = TASK_SIZE; u64 start_ip, abort_ip, offset; u32 usig, __user *uc_sig; rseq_stat_inc(rseq_stats.cs); - if (unlikely(csaddr >= TASK_SIZE)) { + if (unlikely(csaddr >= tasksize)) { t->rseq.event.fatal = true; return false; } @@ -287,7 +289,7 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP * protection. */ - if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) goto die; /* The address is guaranteed to be >= 0 and < TASK_SIZE */ @@ -397,6 +399,128 @@ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *r return rseq_update_user_cs(t, regs, csaddr); } +/* + * If you want to use this then convert your architecture to the generic + * entry code. I'm tired of building workarounds for people who can't be + * bothered to make the maintenance of generic infrastructure less + * burdensome. Just sucking everything into the architecture code and + * thereby making others chase the horrible hacks and keep them working is + * neither acceptable nor sustainable. + */ +#ifdef CONFIG_GENERIC_ENTRY + +/* + * This is inlined into the exit path because: + * + * 1) It's a one time comparison in the fast path when there is no event to + * handle + * + * 2) The access to the user space rseq memory (TLS) is unlikely to fault + * so the straight inline operation is: + * + * - Four 32-bit stores only if CPU ID/ MM CID need to be updated + * - One 64-bit load to retrieve the critical section address + * + * 3) In the unlikely case that the critical section address is != NULL: + * + * - One 64-bit load to retrieve the start IP + * - One 64-bit load to retrieve the offset for calculating the end + * - One 64-bit load to retrieve the abort IP + * - One 64-bit load to retrieve the signature + * - One store to clear the critical section address + * + * The non-debug case implements only the minimal required checking. It + * provides protection against a rogue abort IP in kernel space, which + * would be exploitable at least on x86, and also against a rogue CS + * descriptor by checking the signature at the abort IP. 
Any fallout from + * invalid critical section descriptors is a user space problem. The debug + * case provides the full set of checks and terminates the task if a + * condition is not met. + * + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq + * slow path there will handle the failure. + */ +static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) +{ + /* + * Page faults need to be disabled as this is called with + * interrupts disabled + */ + guard(pagefault)(); + if (likely(!t->rseq.event.ids_changed)) { + struct rseq __user *rseq = t->rseq.usrptr; + /* + * If IDs have not changed rseq_event::user_irq must be true + * See rseq_sched_switch_event(). + */ + u64 csaddr; + + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) + return false; + + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) + return false; + } + return true; + } + + struct rseq_ids ids = { + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + }; + u32 node_id = cpu_to_node(ids.cpu_id); + + return rseq_update_usr(t, regs, &ids, node_id); +} + +static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + struct task_struct *t = current; + + /* + * If the task did not go through schedule or got the flag enforced + * by the rseq syscall or execve, then nothing to do here. + * + * CPU ID and MM CID can only change when going through a context + * switch. + * + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit + * only when rseq_event::has_rseq is true. That conditional is + * required to avoid setting the TIF bit if RSEQ is not registered + * for a task. rseq_event::sched_switch is cleared when RSEQ is + * unregistered by a task so it's sufficient to check for the + * sched_switch bit alone. + * + * A sane compiler requires three instructions for the nothing to do + * case including clearing the events, but your mileage might vary. 
+ */ + if (unlikely((t->rseq.event.sched_switch))) { + rseq_stat_inc(rseq_stats.fastpath); + + if (unlikely(!rseq_exit_user_update(regs, t))) + return true; + } + /* Clear state so next entry starts from a clean slate */ + t->rseq.event.events = 0; + return false; +} + +static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + return false; +} + +#else /* CONFIG_GENERIC_ENTRY */ +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +#endif /* !CONFIG_GENERIC_ENTRY */ + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; @@ -421,9 +545,12 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) if (static_branch_unlikely(&rseq_debug_enabled)) __rseq_debug_syscall_return(regs); } - #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + return false; +} static inline void rseq_exit_to_user_mode(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a1389fff4fca..9c7a34154de8 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -18,6 +18,8 @@ struct rseq; * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME + * is required * * @sched_switch and @ids_changed must be adjacent and the combo must be * 16bit aligned to allow a single store, when both are set at the same @@ -42,6 +44,7 @@ struct rseq_event { u16 error; struct { u8 fatal; + u8 slowpath; }; }; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 183dde756808..c5d6336c6956 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -133,6 +133,7 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); @@ -142,6 +143,7 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "exit: %16lu\n", stats.exit); seq_printf(m, "signal: %16lu\n", stats.signal); seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); seq_printf(m, "ids: %16lu\n", stats.ids); seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); -- cgit v1.2.3 From 3db6b38dfe640207da706b286d4181237391f5bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:19 +0100 Subject: rseq: Switch to fast path processing on exit to user Now that all bits and pieces are in place, hook the RSEQ handling fast path function into exit_to_user_mode_prepare() after the TIF work bits have been handled. If case of fast path failure, TIF_NOTIFY_RESUME has been raised and the caller needs to take another turn through the TIF handling slow path. 
This only works for architectures which use the generic entry code.
Architectures that still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

  Kernel build             Before                After          Reduction
  exit to user           80692981             80514451
  signal checks:            32581                  121                99%
  slowpath runs:          1201408  1.49%           198   0.00%       100%
  fastpath runs:                               675941   0.84%         N/A
  id updates:             1233989  1.53%        50541   0.06%         96%
  cs checks:              1125366  1.39%            0   0.00%        100%
  cs cleared:             1125366   100%            0                100%
  cs fixup:                     0     0%            0

  RSEQ selftests           Before                After          Reduction
  exit to user:         386281778            387373750
  signal checks:         35661203                    0               100%
  slowpath runs:        140542396 36.38%           100   0.00%       100%
  fastpath runs:                              9509789   2.51%         N/A
  id updates:           176203599 45.62%      9087994   2.35%         95%
  cs checks:            175587856 45.46%      4728394   1.22%         98%
  cs cleared:           172359544 98.16%      1319307  27.90%         99%
  cs fixup:               3228312  1.84%      3409087  72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations; they are relative to the actual 'cs check' invocations.

While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ
notify handler is invoked more than once before going out to user space.
Doing this once when everything has stabilized is the only solution to
avoid this.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads that do a lot of quick and short system
calls. Even if the fast path decision is only 4 instructions (including a
conditional branch), this adds up quickly and becomes measurable when the
rate for actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de --- include/linux/irq-entry-common.h | 7 ++----- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 18 ++++++++++++------ init/Kconfig | 2 +- kernel/entry/common.c | 26 +++++++++++++++++++------- kernel/rseq.c | 8 ++++++-- 6 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index cb31fb84d7b4..8f5ceeaaaea5 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -197,11 +197,8 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - */ -unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work); +/* Handle pending TIF work */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index dd3bf7da90a8..bf92227c78d0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(regs); + rseq_handle_slowpath(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index abfbeb42d1a2..ded4baa34586 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,13 +7,19 @@ #include -void __rseq_handle_notify_resume(struct pt_regs *regs); +void __rseq_handle_slowpath(struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct pt_regs *regs) +/* Invoked from resume_user_mode_work() */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) - __rseq_handle_notify_resume(regs); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (current->rseq.event.slowpath) + __rseq_handle_slowpath(regs); + } else { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + __rseq_handle_slowpath(regs); + } } void __rseq_signal_deliver(int sig, struct pt_regs *regs); @@ -152,7 +158,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) } #else /* CONFIG_RSEQ */ -static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } +static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } diff --git a/init/Kconfig b/init/Kconfig index bde40ab664e2..d1c606ec632e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1941,7 +1941,7 @@ config RSEQ_DEBUG_DEFAULT_ENABLE config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT - depends on RSEQ && DEBUG_KERNEL + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY select RSEQ_DEBUG_DEFAULT_ENABLE help Enable extra 
debugging checks for the rseq system call. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 70a16db4cc0a..523a3e758af4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,13 +11,8 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work @@ -62,6 +57,23 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); + + if (likely(!rseq_exit_to_user_mode_restart(regs))) + return ti_work; + ti_work = read_thread_flags(); + } +} + noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { diff --git a/kernel/rseq.c b/kernel/rseq.c index c5d6336c6956..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -237,7 +237,11 @@ efault: static void rseq_slowpath_update_usr(struct pt_regs *regs) { - /* Preserve rseq state and user_irq state for exit to user */ + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; struct rseq_ids ids; @@ -289,7 +293,7 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) } } -void __rseq_handle_notify_resume(struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { /* * If invoked from hypervisors before entering the guest via -- cgit v1.2.3 From 70fe25a3bc53a891f0e6184c12bd55cc524cb13b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:21 +0100 Subject: entry: Split up exit_to_user_mode_prepare() exit_to_user_mode_prepare() is used for both interrupts and syscalls, but there is extra rseq work, which is only required for in the interrupt exit case. Split up the function and provide wrappers for syscalls and interrupts, which allows to separate the rseq exit work in the next step. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.782234789@linutronix.de --- arch/arm64/kernel/entry-common.c | 2 +- include/linux/entry-common.h | 2 +- include/linux/irq-entry-common.h | 49 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index a9c81715ce59..0a97e2621f60 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare(regs); + exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); mte_check_tfsr_exit(); exit_to_user_mode(); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d967184ae08f..87efb38b7081 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -156,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); + syscall_exit_to_user_mode_prepare(regs); } /** diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 8f5ceeaaaea5..5ea61722bb70 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -201,7 +201,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs); unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack * * 1) check that interrupts are disabled @@ -209,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * 3) call exit_to_user_mode_loop() if any flags from * EXIT_TO_USER_MODE_WORK are set * 4) check that interrupts are still disabled + * + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long ti_work; @@ -224,15 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); +} - rseq_exit_to_user_mode(); - +static __always_inline void __exit_to_user_mode_validate(void) +{ /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } +/* Temporary workaround to keep ARM64 alive */ +static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. 
+ */ +static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -297,7 +336,7 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); - exit_to_user_mode_prepare(regs); + irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); } -- cgit v1.2.3 From 7a5201ea1907534efe3a6e9c001ef4c0257cb3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:24 +0100 Subject: rseq: Split up rseq_exit_to_user_mode() Separate the interrupt and syscall exit handling. Syscall exit does not require to clear the user_irq bit as it can't be set. On interrupt exit it can be set when the interrupt did not result in a scheduling event and therefore the return path did not invoke the TIF work handling, which would have cleared it. The debug check for the event state is also not really required even when debug mode is enabled via the static key. Debug mode is largely aiding user space by enabling a larger amount of validation checks, which cause a segfault when a malformed critical section is detected. In production mode the critical section handling takes the content mostly as is and lets user space keep the pieces when it screwed up. On kernel changes in that area the state check is useful, but that can be done when lockdep is enabled, which is anyway a required test scenario for fundamental changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.842785700@linutronix.de --- include/linux/irq-entry-common.h | 6 +++--- include/linux/rseq_entry.h | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 5ea61722bb70..bc5d178e0b91 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -240,7 +240,7 @@ static __always_inline void __exit_to_user_mode_validate(void) static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -254,7 +254,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -268,7 +268,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 3f13be7301fa..958a63eeb2d3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -521,7 +521,37 @@ static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } #endif /* !CONFIG_GENERIC_ENTRY */ -static __always_inline void rseq_exit_to_user_mode(void) +static __always_inline void rseq_syscall_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + /* Needed to remove the store for the !lockdep case */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + WARN_ON_ONCE(ev->sched_switch); + ev->events = 0; + } +} + +static __always_inline void rseq_irqentry_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + lockdep_assert_once(!ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing could not clear it. 
+ */ + ev->events = 0; +} + +/* Required to keep ARM64 working */ +static __always_inline void rseq_exit_to_user_mode_legacy(void) { struct rseq_event *ev = ¤t->rseq.event; @@ -551,7 +581,9 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } -static inline void rseq_exit_to_user_mode(void) { } +static inline void rseq_syscall_exit_to_user_mode(void) { } +static inline void rseq_irqentry_exit_to_user_mode(void) { } +static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ -- cgit v1.2.3 From 32034df66b5f49626aa450ceaf1849a08d87906e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:26 +0100 Subject: rseq: Switch to TIF_RSEQ if supported TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially with the RSEQ fast path depending on it, but not really handling it. Define a separate TIF_RSEQ in the generic TIF space and enable the full separation of fast and slow path for architectures which utilize that. That avoids the hassle with invocations of resume_user_mode_work() from hypervisors, which clear TIF_NOTIFY_RESUME. It makes the therefore required re-evaluation at the end of vcpu_run() a NOOP on architectures which utilize the generic TIF space and have a separate TIF_RSEQ. The hypervisor TIF handling does not include the separate TIF_RSEQ as there is no point in doing so. The guest does neither know nor care about the VMM host applications RSEQ state. That state is only relevant when the ioctl() returns to user space. The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure handling, but this only happens within exit_to_user_mode_loop(), so arguably the hypervisor ioctl() code is long done when this happens. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de --- include/asm-generic/thread_info_tif.h | 3 +++ include/linux/irq-entry-common.h | 2 +- include/linux/rseq.h | 22 +++++++++++++++------- include/linux/rseq_entry.h | 32 +++++++++++++++++++++++++++++--- include/linux/thread_info.h | 5 +++++ kernel/entry/common.c | 10 ++++++++-- 6 files changed, 61 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index ee3793e9b1a4..da1610a78f92 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -45,4 +45,7 @@ # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif +#define TIF_RSEQ 11 // Run RSEQ fast path +#define _TIF_RSEQ BIT(TIF_RSEQ) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index bc5d178e0b91..72e3f7a59469 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -30,7 +30,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ded4baa34586..b5e4803c4ebe 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -42,7 +42,7 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg static inline void rseq_raise_notify_resume(struct task_struct *t) { - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + set_tsk_thread_flag(t, TIF_RSEQ); } /* Invoked from context switch to force evaluation on exit to user */ @@ -114,17 +114,25 @@ static inline void rseq_force_update(void) /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, - * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in - * that case just to do it eventually again before returning to user space, - * the entry resume_user_mode_work() invocation is ignored as the register - * argument is NULL. + * which clears TIF_NOTIFY_RESUME on architectures that don't use the + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. * - * After returning from guest mode, they have to invoke this function to - * re-raise TIF_NOTIFY_RESUME if necessary. + * To avoid updating user space RSEQ in that case just to do it eventually + * again before returning to user space, because __rseq_handle_slowpath() + * does nothing when invoked with NULL register state. + * + * After returning from guest mode, before exiting to userspace, hypervisors + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. */ static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) + /* + * The generic optimization for deferring RSEQ updates until the next + * exit relies on having a dedicated TIF_RSEQ. 
+ */ + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && + current->rseq.event.sched_switch) rseq_raise_notify_resume(current); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 958a63eeb2d3..c92167ff8a7f 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -507,18 +507,44 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg return false; } -static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +static __always_inline bool test_tif_rseq(unsigned long ti_work) { + return ti_work & _TIF_RSEQ; +} + +static __always_inline void clear_tif_rseq(void) +{ + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); + clear_thread_flag(TIF_RSEQ); +} +#else +static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } +static __always_inline void clear_tif_rseq(void) { } +#endif + +static __always_inline bool +rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + if (likely(!test_tif_rseq(ti_work))) + return false; + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { current->rseq.event.slowpath = true; set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); return true; } + + clear_tif_rseq(); return false; } #else /* CONFIG_GENERIC_ENTRY */ -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} #endif /* !CONFIG_GENERIC_ENTRY */ static __always_inline void rseq_syscall_exit_to_user_mode(void) @@ -577,7 +603,7 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) } #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) { return false; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index dd925d84fa46..b40de9bab4b7 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -67,6 +67,11 @@ enum syscall_work_bit { #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif +#ifndef TIF_RSEQ +# define TIF_RSEQ TIF_NOTIFY_RESUME +# define _TIF_RSEQ _TIF_NOTIFY_RESUME +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 523a3e758af4..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,6 +11,12 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -18,7 +24,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re * Before returning to user space ensure that all pending work * items have been completed. 
*/ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -68,7 +74,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, for (;;) { ti_work = __exit_to_user_mode_loop(regs, ti_work); - if (likely(!rseq_exit_to_user_mode_restart(regs))) + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) return ti_work; ti_work = read_thread_flags(); } -- cgit v1.2.3 From 323d93f0432edb5415c79bd35e15e5754a76e486 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 12:02:12 +0100 Subject: cleanup: Always inline everything KASAN bloat caused cleanup helper functions to not get inlined: vmlinux.o: error: objtool: irqentry_exit+0x323: call to class_user_rw_access_destructor() with UACCESS enabled Force inline all the cleanup helpers like they already are on normal builds. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..d1806ac5342c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -208,7 +208,7 @@ */ #define DEFINE_FREE(_name, _type, _free) \ - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } #define __free(_name) __cleanup(__free_##_name) @@ -220,7 +220,7 @@ __val; \ }) -static inline __must_check +static __always_inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } @@ -274,16 +274,16 @@ const volatile void * __must_check_fn(const volatile void *val) #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ typedef _type class_##_name##_t; \ -static inline void class_##_name##_destructor(_type *p) \ +static __always_inline void class_##_name##_destructor(_type *p) \ { _type _T = *p; _exit; } \ -static inline _type class_##_name##_constructor(_init_args) \ +static __always_inline _type class_##_name##_constructor(_init_args) \ { _type t = _init; return t; } #define EXTEND_CLASS(_name, ext, _init, _init_args...) 
\ typedef class_##_name##_t class_##_name##ext##_t; \ -static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ { class_##_name##_destructor(p); } \ -static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ +static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ { class_##_name##_t t = _init; return t; } #define CLASS(_name, var) \ @@ -347,7 +347,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond }) #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ void *_ptr = (void *)(__force unsigned long)*(_exp); \ if (IS_ERR(_ptr)) { \ @@ -355,7 +355,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond } \ return _ptr; \ } \ - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ { \ long _rc = (__force unsigned long)*(_exp); \ if (!_rc) { \ @@ -384,9 +384,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } /* @@ -466,7 +466,7 @@ typedef struct { \ __VA_ARGS__; \ } class_##_name##_t; \ \ -static inline void class_##_name##_destructor(class_##_name##_t *_T) \ +static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ } \ @@ -474,7 +474,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ _lock; \ @@ -482,7 +482,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ } #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(void) \ +static __always_inline class_##_name##_t class_##_name##_constructor(void) \ { \ class_##_name##_t _t = { .lock = (void*)1 }, \ *_T __maybe_unused = &_t; \ @@ -508,9 +508,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ _t; }), \ typeof_member(class_##_name##_t, lock) l) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \ -- cgit v1.2.3 
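For reference, with the change above a typical helper definition such as the
kfree variant (as found in include/linux/slab.h) now expands to a forced
inline destructor, which avoids the out-of-line call objtool complains about
in UACCESS regions:

	DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

	/* roughly expands to: */
	static __always_inline void __free_kfree(void *p)
	{
		void *_T = *(void **)p;

		if (!IS_ERR_OR_NULL(_T))
			kfree(_T);
	}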
From 4e97bae1b412cd6ed8053b3d8a242122952985cc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:40 +0100 Subject: cleanup: fix scoped_class() This is a class, not a guard so why on earth is it checking for guard pointers or conditional lock acquisition? None of it makes any sense at all. I'm not sure what happened back then. Maybe I had a brief psychedelic period that I completely forgot about and spaced out into a zone where that initial macro implementation made any sense at all. Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-1-cb3ec8711a6a@kernel.org Fixes: 5c21c5f22d07 ("cleanup: add a scoped version of CLASS()") Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..19c7e475d3a4 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -290,15 +290,16 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor -#define scoped_class(_name, var, args) \ - for (CLASS(_name, var)(args); \ - __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ - ({ goto _label; })) \ - if (0) { \ -_label: \ - break; \ +#define __scoped_class(_name, var, _label, args...) \ + for (CLASS(_name, var)(args); ; ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ } else +#define scoped_class(_name, var, args...) \ + __scoped_class(_name, var, __UNIQUE_ID(label), args) + /* * DEFINE_GUARD(name, type, lock, unlock): * trivial wrapper around DEFINE_CLASS() above specifically -- cgit v1.2.3 From 4c7ceeb62d3330b6fb2b549ae833a92c0f481f3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:41 +0100 Subject: cred: add kernel_cred() helper Access kernel creds based off of init_task. This will let us avoid any direct access to init_cred. Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-2-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 89ae50ad2ace..8ab3718184ad 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -20,6 +20,8 @@ struct cred; struct inode; +extern struct task_struct init_task; + /* * COW Supplementary groups list */ @@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); +static inline const struct cred *kernel_cred(void) +{ + /* shut up sparse */ + return rcu_dereference_raw(init_task.cred); +} extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); -- cgit v1.2.3 From 40314c2818b700da695c9686348be7aef9e156a2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:42 +0100 Subject: cred: make init_cred static There's zero need to expose struct init_cred. The very few places that need access can just go through init_task which is already exported. 
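With kernel_cred() available, former direct users of init_cred can switch to
the helper; an illustrative (not in-tree) override pattern, using the existing
override_creds()/revert_creds() API:

	const struct cred *old;

	old = override_creds(kernel_cred());
	/* ... do the privileged work ... */
	revert_creds(old);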
Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-3-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/init_task.h | 1 - init/init_task.c | 27 +++++++++++++++++++++++++++ kernel/cred.c | 27 --------------------------- security/keys/process_keys.c | 2 +- 4 files changed, 28 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bccb3f1f6262..a6cb241ea00c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #define INIT_PREV_CPUTIME(x) .prev_cputime = { \ diff --git a/init/init_task.c b/init/init_task.c index a55e2189206f..d970a847b657 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -62,6 +62,33 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = { }; #endif +/* init to 2 - one for init_task, one to ensure it is never freed */ +static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; + +/* + * The initial credentials for the initial task + */ +static struct cred init_cred = { + .usage = ATOMIC_INIT(4), + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, + .suid = GLOBAL_ROOT_UID, + .sgid = GLOBAL_ROOT_GID, + .euid = GLOBAL_ROOT_UID, + .egid = GLOBAL_ROOT_GID, + .fsuid = GLOBAL_ROOT_UID, + .fsgid = GLOBAL_ROOT_GID, + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_EMPTY_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, + .user = INIT_USER, + .user_ns = &init_user_ns, + .group_info = &init_groups, + .ucounts = &init_ucounts, +}; + /* * Set up the first task table, touch at your own risk!. 
Base=0, * limit=0x1fffff (=2MB) diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..ac87ed9d43b1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index b5d5333ab330..a63c46bb2d14 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -51,7 +51,7 @@ static struct key *get_user_register(struct user_namespace *user_ns) if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, - &init_cred, + kernel_cred(), KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, -- cgit v1.2.3 From ae40e6c65791f47c76cc14d0cce2707fe6053f72 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:43 +0100 Subject: cred: add scoped_with_kernel_creds() Add a new cleanup class for override creds. We can make use of this in a bunch of places going forward. Based on this scoped_with_kernel_creds() that can be used to temporarily assume kernel credentials for specific tasks such as firmware loading, or coredump socket connections. At no point will the caller interact with the kernel credentials directly. Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-4-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 8ab3718184ad..be2cd07b174c 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -187,6 +187,14 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return rcu_replace_pointer(current->cred, revert_cred, 1); } +DEFINE_CLASS(override_creds, + const struct cred *, + revert_creds(_T), + override_creds(override_cred), const struct cred *override_cred) + +#define scoped_with_kernel_creds() \ + scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred()) + /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference -- cgit v1.2.3 From 019e52e8d324d568e71730946beb11e7b275ff08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 12:26:49 +0100 Subject: cred: add scoped_with_creds() guards and implement scoped_with_kernel_creds() on top of it. 
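A typical caller then reads like this (illustrative usage only, not taken
from an in-tree user; do_privileged_work() is a stand-in):

	scoped_with_kernel_creds() {
		/*
		 * Runs with kernel credentials; the previous credentials are
		 * restored automatically when the scope is left.
		 */
		ret = do_privileged_work();
	}

	/* or with explicitly provided credentials: */
	scoped_with_creds(file->f_cred) {
		ret = do_privileged_work();
	}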
Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-1-a3e156839e7f@kernel.org Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/cred.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index be2cd07b174c..6ea2d81a740b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -192,8 +192,10 @@ DEFINE_CLASS(override_creds, revert_creds(_T), override_creds(override_cred), const struct cred *override_cred) -#define scoped_with_kernel_creds() \ - scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred()) +#define scoped_with_creds(cred) \ + scoped_class(override_creds, __UNIQUE_ID(label), cred) + +#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred()) /** * get_cred_many - Get references on a set of credentials -- cgit v1.2.3 From c8ad3098e1272444b6c75910d6196a36f5c8bc17 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 15:57:27 +0100 Subject: cred: add prepare credential guard A lot of code uses the following pattern: * prepare new credentials * modify them for their use-case * drop them Support that easier with the new guard infrastructure. Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-1-b447b82f2c9b@kernel.org Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 6ea2d81a740b..343a140a6ba2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -280,6 +280,11 @@ static inline void put_cred(const struct cred *cred) put_cred_many(cred, 1); } +DEFINE_CLASS(prepare_creds, + struct cred *, + if (_T) put_cred(_T), + prepare_creds(), void) + DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** -- cgit v1.2.3 From ec7f31b2a2d3bf6b9e4d4b8cd156587f1d0607d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:16:45 -0500 Subject: block: make bio auto-integrity deadlock safe The current block layer automatic integrity protection allocates the actual integrity buffer, which has three problems: - because it happens at the bottom of the I/O stack and doesn't use a mempool it can deadlock under load - because the data size in a bio is almost unbounded when using lage folios it can relatively easily exceed the maximum kmalloc size - even when it does not exceed the maximum kmalloc size, it could exceed the maximum segment size of the device Fix this by limiting the I/O size so that we can allocate at least a 2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity intervals, and create a mempool as a last resort for this maximum size, mirroring the scheme used for bvecs. As a nice upside none of this can fail now, so we remove the error handling and open code the trivial addition of the bip vec. The new allocation helpers sit outside of bio-integrity-auto.c because I plan to reuse them for file system based PI in the near future. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 22 +++----------------- block/bio-integrity.c | 48 +++++++++++++++++++++++++++++++++++++++++++ block/blk-settings.c | 21 +++++++++++++++++++ include/linux/bio-integrity.h | 6 ++++++ include/linux/blk-integrity.h | 5 +++++ 5 files changed, 83 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 2f4a244749ac..9850c338548d 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -29,7 +29,7 @@ static void bio_integrity_finish(struct bio_integrity_data *bid) { bid->bio->bi_integrity = NULL; bid->bio->bi_opf &= ~REQ_INTEGRITY; - kfree(bvec_virt(bid->bip.bip_vec)); + bio_integrity_free_buf(&bid->bip); mempool_free(bid, &bid_pool); } @@ -110,8 +110,6 @@ bool bio_integrity_prep(struct bio *bio) struct bio_integrity_data *bid; bool set_flags = true; gfp_t gfp = GFP_NOIO; - unsigned int len; - void *buf; if (!bi) return true; @@ -152,17 +150,12 @@ bool bio_integrity_prep(struct bio *bio) if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return true; - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (!buf) - goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bid->bio = bio; - bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO); + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (set_flags) { @@ -174,21 +167,12 @@ bool bio_integrity_prep(struct bio *bio) bid->bip.bip_flags |= BIP_CHECK_REFTAG; } - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) - goto err_end_io; - /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bed26f1ec869..09eeaf6e74b8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,6 +14,45 @@ struct bio_integrity_alloc { struct bio_vec bvecs[]; }; +static mempool_t integrity_buf_pool; + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned int len = bio_integrity_bytes(bi, bio_sectors(bio)); + gfp_t gfp = GFP_NOIO | (zero_buffer ? 
__GFP_ZERO : 0); + void *buf; + + buf = kmalloc(len, (gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN); + if (unlikely(!buf)) { + struct page *page; + + page = mempool_alloc(&integrity_buf_pool, GFP_NOFS); + if (zero_buffer) + memset(page_address(page), 0, len); + bvec_set_page(&bip->bip_vec[0], page, len, 0); + bip->bip_flags |= BIP_MEMPOOL; + } else { + bvec_set_page(&bip->bip_vec[0], virt_to_page(buf), len, + offset_in_page(buf)); + } + + bip->bip_vcnt = 1; + bip->bip_iter.bi_size = len; +} + +void bio_integrity_free_buf(struct bio_integrity_payload *bip) +{ + struct bio_vec *bv = &bip->bip_vec[0]; + + if (bip->bip_flags & BIP_MEMPOOL) + mempool_free(bv->bv_page, &integrity_buf_pool); + else + kfree(bvec_virt(bv)); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -438,3 +477,12 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } + +static int __init bio_integrity_initfn(void) +{ + if (mempool_init_page_pool(&integrity_buf_pool, BIO_POOL_SIZE, + get_order(BLK_INTEGRITY_MAX_SIZE))) + panic("bio: can't create integrity buf pool\n"); + return 0; +} +subsys_initcall(bio_integrity_initfn); diff --git a/block/blk-settings.c b/block/blk-settings.c index 345b6a271cc3..e0d0b035f39d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -123,6 +123,19 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; @@ -184,6 +197,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); + /* + * The block layer automatically adds integrity data for bios that don't + * already have it. Limit the I/O size so that a single maximum size + * metadata segment can cover the integrity data for the entire I/O. 
+ */ + lim->max_sectors = min(lim->max_sectors, + max_integrity_io_size(lim) >> SECTOR_SHIFT); + return 0; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 851254f36eb3..3d05296a5afe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -14,6 +14,8 @@ enum bip_flags { BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_P2P_DMA = 1 << 8, /* using P2P address */ + + BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { @@ -140,4 +142,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); +void bio_integrity_free_buf(struct bio_integrity_payload *bip); + #endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index b659373788f6..c2030fd8ba0a 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,6 +8,11 @@ struct request; +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, -- cgit v1.2.3 From 1b6aa81c85621d6b55099906585ff09a477203b8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 3 Nov 2025 11:50:15 +0000 Subject: net: stmmac: add support for configuring the phy_intf_sel inputs When dwmac is synthesised with support for multiple PHY interfaces, the core provides phy_intf_sel inputs, sampled on reset, to configure the PHY facing interface. Use stmmac_get_phy_intf_sel() in core code to determine the dwmac phy_intf_sel input value, and provide a new platform method called with this value just before we issue a soft reset to the dwmac core. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vFt4h-0000000Chos-3wxX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 34 +++++++++++++++++++++++ include/linux/stmmac.h | 1 + 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6d4323d04573..ccf383b355e7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3102,6 +3102,36 @@ int stmmac_get_phy_intf_sel(phy_interface_t interface) } EXPORT_SYMBOL_GPL(stmmac_get_phy_intf_sel); +static int stmmac_prereset_configure(struct stmmac_priv *priv) +{ + struct plat_stmmacenet_data *plat_dat = priv->plat; + phy_interface_t interface; + int phy_intf_sel, ret; + + if (!plat_dat->set_phy_intf_sel) + return 0; + + interface = plat_dat->phy_interface; + phy_intf_sel = stmmac_get_phy_intf_sel(interface); + if (phy_intf_sel < 0) { + netdev_err(priv->dev, + "failed to get phy_intf_sel for %s: %pe\n", + phy_modes(interface), ERR_PTR(phy_intf_sel)); + return phy_intf_sel; + } + + ret = plat_dat->set_phy_intf_sel(plat_dat->bsp_priv, phy_intf_sel); + if (ret == -EINVAL) + netdev_err(priv->dev, "platform does not support %s\n", + phy_modes(interface)); + else if (ret < 0) + netdev_err(priv->dev, + "platform failed to set interface %s: %pe\n", + phy_modes(interface), ERR_PTR(ret)); + + return ret; +} + /** * stmmac_init_dma_engine - DMA init. 
* @priv: driver private structure @@ -3128,6 +3158,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE)) priv->plat->dma_cfg->atds = 1; + ret = stmmac_prereset_configure(priv); + if (ret) + return ret; + ret = stmmac_reset(priv, priv->ioaddr); if (ret) { netdev_err(priv->dev, "Failed to reset the dma\n"); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 151c81c560c8..48e9f1d4e17e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -250,6 +250,7 @@ struct plat_stmmacenet_data { struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, unsigned long *interfaces); + int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); -- cgit v1.2.3 From c3838262b824c71c145cd3668722e99a69bc9cd9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 31 Oct 2025 14:05:51 +0800 Subject: virtio_net: fix alignment for virtio_net_hdr_v1_hash Changing alignment of header would mean it's no longer safe to cast a 2 byte aligned pointer between formats. Use two 16 bit fields to make it 2 byte aligned as previously. This fixes the performance regression since commit ("virtio_net: enable gso over UDP tunnel support.") as it uses virtio_net_hdr_v1_hash_tunnel which embeds virtio_net_hdr_v1_hash. Pktgen in guest + XDP_DROP on TAP + vhost_net shows the TX PPS is recovered from 2.4Mpps to 4.45Mpps. Fixes: 56a06bd40fab ("virtio_net: enable gso over UDP tunnel support.") Cc: stable@vger.kernel.org Signed-off-by: Michael S. Tsirkin Signed-off-by: Jason Wang Tested-by: Lei Yang Link: https://patch.msgid.link/20251031060551.126-1-jasowang@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 15 +++++++++++++-- include/linux/virtio_net.h | 3 ++- include/uapi/linux/virtio_net.h | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8e8a179aaa49..e6e650bc3bc3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2539,6 +2539,13 @@ err_buf: return NULL; } +static inline u32 +virtio_net_hash_value(const struct virtio_net_hdr_v1_hash *hdr_hash) +{ + return __le16_to_cpu(hdr_hash->hash_value_lo) | + (__le16_to_cpu(hdr_hash->hash_value_hi) << 16); +} + static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { @@ -2565,7 +2572,7 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, default: rss_hash_type = PKT_HASH_TYPE_NONE; } - skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); + skb_set_hash(skb, virtio_net_hash_value(hdr_hash), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, @@ -3311,6 +3318,10 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + /* Make sure it's safe to cast between formats */ + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr)); + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr.hdr)); + can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; @@ -6750,7 +6761,7 @@ static int virtnet_xdp_rx_hash(const struct 
xdp_md *_ctx, u32 *hash, hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; - *hash = __le32_to_cpu(hdr_hash->hash_value); + *hash = virtio_net_hash_value(hdr_hash); return 0; } diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 4d1780848d0e..b673c31569f3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -401,7 +401,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, if (!tnl_hdr_negotiated) return -EINVAL; - vhdr->hash_hdr.hash_value = 0; + vhdr->hash_hdr.hash_value_lo = 0; + vhdr->hash_hdr.hash_value_hi = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 8bf27ab8bcb4..1db45b01532b 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -193,7 +193,8 @@ struct virtio_net_hdr_v1 { struct virtio_net_hdr_v1_hash { struct virtio_net_hdr_v1 hdr; - __le32 hash_value; + __le16 hash_value_lo; + __le16 hash_value_hi; #define VIRTIO_NET_HASH_REPORT_NONE 0 #define VIRTIO_NET_HASH_REPORT_IPv4 1 #define VIRTIO_NET_HASH_REPORT_TCPv4 2 -- cgit v1.2.3 From 617a0dd24ef2b4e6240df48b1fbac1c3ebfa9282 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Nov 2025 23:26:49 +0100 Subject: net: phy: make phy_device members pause and asym_pause bitfield bits We can reduce the size of struct phy_device a little by switching the type of members pause and asym_pause from int to a single bit. As C99 is supported now, we can use type bool for the bitfield members, what provides us with the benefit of the usual implicit bool conversions. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/764e9a31-b40b-4dc9-b808-118192a16d87@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 20 ++++++++++---------- drivers/net/phy/phy_device.c | 16 ++++++++-------- include/linux/phy.h | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 61670be0f095..1a7b32be4625 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -485,8 +485,8 @@ static int genphy_c45_baset1_read_lpa(struct phy_device *phydev) mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, 0); mii_t1_adv_m_mod_linkmode_t(phydev->lp_advertising, 0); - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; return 0; } @@ -498,8 +498,8 @@ static int genphy_c45_baset1_read_lpa(struct phy_device *phydev) return val; mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, val); - phydev->pause = val & MDIO_AN_T1_ADV_L_PAUSE_CAP ? 1 : 0; - phydev->asym_pause = val & MDIO_AN_T1_ADV_L_PAUSE_ASYM ? 1 : 0; + phydev->pause = val & MDIO_AN_T1_ADV_L_PAUSE_CAP; + phydev->asym_pause = val & MDIO_AN_T1_ADV_L_PAUSE_ASYM; val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_LP_M); if (val < 0) @@ -536,8 +536,8 @@ int genphy_c45_read_lpa(struct phy_device *phydev) phydev->lp_advertising); mii_10gbt_stat_mod_linkmode_lpa_t(phydev->lp_advertising, 0); mii_adv_mod_linkmode_adv_t(phydev->lp_advertising, 0); - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; return 0; } @@ -551,8 +551,8 @@ int genphy_c45_read_lpa(struct phy_device *phydev) return val; mii_adv_mod_linkmode_adv_t(phydev->lp_advertising, val); - phydev->pause = val & LPA_PAUSE_CAP ? 1 : 0; - phydev->asym_pause = val & LPA_PAUSE_ASYM ? 
1 : 0; + phydev->pause = val & LPA_PAUSE_CAP; + phydev->asym_pause = val & LPA_PAUSE_ASYM; /* Read the link partner's 10G advertisement */ val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_10GBT_STAT); @@ -1171,8 +1171,8 @@ int genphy_c45_read_status(struct phy_device *phydev) phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; if (phydev->autoneg == AUTONEG_ENABLE) { ret = genphy_c45_read_lpa(phydev); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 737747cf1906..81984d4ebb7c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -825,8 +825,8 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, dev->speed = SPEED_UNKNOWN; dev->duplex = DUPLEX_UNKNOWN; - dev->pause = 0; - dev->asym_pause = 0; + dev->pause = false; + dev->asym_pause = false; dev->link = 0; dev->port = PORT_TP; dev->interface = PHY_INTERFACE_MODE_GMII; @@ -2092,8 +2092,8 @@ int genphy_setup_forced(struct phy_device *phydev) { u16 ctl; - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; ctl = mii_bmcr_encode_fixed(phydev->speed, phydev->duplex); @@ -2500,8 +2500,8 @@ int genphy_read_status(struct phy_device *phydev) phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; if (phydev->is_gigabit_capable) { err = genphy_read_master_slave(phydev); @@ -2554,8 +2554,8 @@ int genphy_c37_read_status(struct phy_device *phydev, bool *changed) /* Signal link has changed */ *changed = true; phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + phydev->pause = false; + phydev->asym_pause = false; if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { lpa = phy_read(phydev, MII_LPA); diff --git a/include/linux/phy.h b/include/linux/phy.h index e3474f03cbc1..d145a200ea21 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -666,6 +666,8 @@ struct phy_device { /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; + bool pause:1; + bool asym_pause:1; /* Interrupts are enabled */ unsigned interrupts:1; @@ -690,8 +692,6 @@ struct phy_device { int speed; int duplex; int port; - int pause; - int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; -- cgit v1.2.3 From c9445e3c087656e01d0160a48f90389856baf368 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Oct 2025 22:41:19 +0100 Subject: net: phy: fixed_phy: add helper fixed_phy_register_100fd In few places a 100FD fixed PHY is used. Create a helper so that users don't have to define the struct fixed_phy_status. 
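A minimal usage sketch (illustrative only, not part of this patch; struct example_priv and the surrounding driver context are hypothetical):

	static int example_attach_fixed_phy(struct example_priv *priv)
	{
		/*
		 * Previously such a caller had to open-code the status:
		 *
		 *	static const struct fixed_phy_status status = {
		 *		.link = 1,
		 *		.speed = SPEED_100,
		 *		.duplex = DUPLEX_FULL,
		 *	};
		 *	priv->phydev = fixed_phy_register(&status, NULL);
		 */
		priv->phydev = fixed_phy_register_100fd();
		if (IS_ERR(priv->phydev))
			return PTR_ERR(priv->phydev);

		return 0;
	}
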
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/bf564b19-e9bc-4896-aeae-9f721cc4fecd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/fixed_phy.c | 12 ++++++++++++ include/linux/phy_fixed.h | 6 ++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 0e1b28f06f18..bdc3a4bffede 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -227,6 +227,18 @@ struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, } EXPORT_SYMBOL_GPL(fixed_phy_register); +struct phy_device *fixed_phy_register_100fd(void) +{ + static const struct fixed_phy_status status = { + .link = 1, + .speed = SPEED_100, + .duplex = DUPLEX_FULL, + }; + + return fixed_phy_register(&status, NULL); +} +EXPORT_SYMBOL_GPL(fixed_phy_register_100fd); + void fixed_phy_unregister(struct phy_device *phy) { phy_device_remove(phy); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index d17ff750c708..08275ef64147 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -20,6 +20,7 @@ extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); +struct phy_device *fixed_phy_register_100fd(void); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -34,6 +35,11 @@ fixed_phy_register(const struct fixed_phy_status *status, return ERR_PTR(-ENODEV); } +static inline struct phy_device *fixed_phy_register_100fd(void) +{ + return ERR_PTR(-ENODEV); +} + static inline void fixed_phy_unregister(struct phy_device *phydev) { } -- cgit v1.2.3 From 5de9ea1c50f061892625388880e83fdc50a4ef66 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Oct 2025 22:46:32 +0100 Subject: net: phy: fixed_phy: remove fixed_phy_add fixed_phy_add() has a number of problems/disadvantages: - It uses phy address 0 w/o checking whether a fixed phy with this address exists already. - A subsequent call to fixed_phy_register() would also use phy address 0, because fixed_phy_add() doesn't mark it as used. - fixed_phy_add() is used from platform code, therefore requires that fixed_phy code is built-in. Now that for the only two users (coldfire/5272 and bcm47xx) fixed_phy creation has been moved to the respective ethernet driver (fec, b44), we can remove fixed_phy_add(). 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/bee046a1-1e77-4057-8b04-fdb2a1bbbd08@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/fixed_phy.c | 6 ------ include/linux/phy_fixed.h | 2 -- 2 files changed, 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index bdc3a4bffede..d498d8a9bba6 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -131,12 +131,6 @@ static int __fixed_phy_add(int phy_addr, return 0; } -void fixed_phy_add(const struct fixed_phy_status *status) -{ - __fixed_phy_add(0, status); -} -EXPORT_SYMBOL_GPL(fixed_phy_add); - static DEFINE_IDA(phy_fixed_ida); static void fixed_phy_del(int phy_addr) diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 08275ef64147..8bade999831c 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,7 +17,6 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); struct phy_device *fixed_phy_register_100fd(void); @@ -27,7 +26,6 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline void fixed_phy_add(const struct fixed_phy_status *status) {} static inline struct phy_device * fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) -- cgit v1.2.3 From bf33247a90d3e85d53a9b55bb276b725456ff0bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:09 -0800 Subject: net: Add struct sockaddr_unsized for sockaddr of unknown length Add flexible sockaddr structure to support addresses longer than the traditional 14-byte struct sockaddr::sa_data limitation without requiring the full 128-byte sa_data of struct sockaddr_storage. This allows the network APIs to pass around a pointer to an object that isn't lying to the compiler about how big it is, but must be accompanied by its actual size as an additional parameter. It's possible we may way to migrate to including the size with the struct in the future, e.g.: struct sockaddr_unsized { u16 sa_data_len; u16 sa_family; u8 sa_data[] __counted_by(sa_data_len); }; Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-1-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 3b262487ec06..7b1a01be29da 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -40,6 +40,23 @@ struct sockaddr { }; }; +/** + * struct sockaddr_unsized - Unspecified size sockaddr for callbacks + * @sa_family: Address family (AF_UNIX, AF_INET, AF_INET6, etc.) + * @sa_data: Flexible array for address data + * + * This structure is designed for callback interfaces where the + * total size is known via the sockaddr_len parameter. Unlike struct + * sockaddr which has a fixed 14-byte sa_data limit or struct + * sockaddr_storage which has a fixed 128-byte sa_data limit, this + * structure can accommodate addresses of any size, but must be used + * carefully. 
+ */ +struct sockaddr_unsized { + __kernel_sa_family_t sa_family; /* address family, AF_xxx */ + char sa_data[]; /* flexible address data */ +}; + struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ -- cgit v1.2.3 From 0e50474fa514822e9d990874e554bf8043a201d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:10 -0800 Subject: net: Convert proto_ops bind() callbacks to use sockaddr_unsized Update all struct proto_ops bind() callback function prototypes from "struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the compiler about object sizes. Calls into struct proto handlers gain casts that will be removed in the struct proto conversion patch. No binary changes expected. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-2-kees@kernel.org Signed-off-by: Jakub Kicinski --- crypto/af_alg.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/infiniband/hw/erdma/erdma_cm.c | 4 ++-- drivers/infiniband/sw/siw/siw_cm.c | 6 +++--- drivers/isdn/mISDN/l1oip_core.c | 2 +- drivers/isdn/mISDN/socket.c | 4 ++-- drivers/net/ppp/pptp.c | 4 ++-- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/tcp.c | 2 +- drivers/target/iscsi/iscsi_target_login.c | 2 +- drivers/xen/pvcalls-back.c | 2 +- fs/afs/rxrpc.c | 6 +++--- fs/dlm/lowcomms.c | 6 +++--- fs/ocfs2/cluster/tcp.c | 4 ++-- fs/smb/client/connect.c | 2 +- fs/smb/server/transport_tcp.c | 4 ++-- include/linux/net.h | 4 ++-- include/net/inet_common.h | 2 +- include/net/ipv6.h | 2 +- include/net/sock.h | 2 +- net/9p/trans_fd.c | 2 +- net/appletalk/ddp.c | 2 +- net/atm/pvc.c | 4 ++-- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/iso.c | 4 ++-- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/rfcomm/core.c | 4 ++-- net/bluetooth/rfcomm/sock.c | 2 +- net/bluetooth/sco.c | 2 +- net/can/isotp.c | 2 +- net/can/j1939/socket.c | 2 +- net/can/raw.c | 2 +- net/core/sock.c | 2 +- net/ieee802154/socket.c | 4 ++-- net/ipv4/af_inet.c | 4 ++-- net/ipv4/udp_tunnel_core.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/ip6_udp_tunnel.c | 2 +- net/iucv/af_iucv.c | 2 +- net/l2tp/l2tp_core.c | 4 ++-- net/llc/af_llc.c | 2 +- net/mctp/af_mctp.c | 2 +- net/mctp/test/route-test.c | 2 +- net/mptcp/protocol.c | 6 +++--- net/mptcp/subflow.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_sock.c | 4 ++-- net/packet/af_packet.c | 11 ++++++----- net/phonet/socket.c | 8 ++++---- net/qrtr/af_qrtr.c | 2 +- net/qrtr/ns.c | 2 +- net/rds/bind.c | 2 +- net/rds/rds.h | 2 +- net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 2 +- net/rose/af_rose.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/rxperf.c | 2 +- net/smc/af_smc.c | 2 +- net/smc/smc.h | 2 +- net/socket.c | 6 +++--- net/sunrpc/clnt.c | 4 ++-- net/sunrpc/svcsock.c | 2 +- net/sunrpc/xprtsock.c | 4 ++-- net/tipc/socket.c | 4 ++-- net/unix/af_unix.c | 4 ++-- net/vmw_vsock/af_vsock.c | 4 ++-- net/x25/af_x25.c | 2 +- net/xdp/xsk.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 2 +- 74 files changed, 113 insertions(+), 112 deletions(-) (limited to 'include/linux') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index ca6fdcc6c54a..5e760ab62618 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -145,7 +145,7 @@ void af_alg_release_parent(struct sock *sk) } EXPORT_SYMBOL_GPL(af_alg_release_parent); -static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int alg_bind(struct 
socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY; struct sock *sk = sock->sk; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index caaf2781136d..d9296f74f902 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -450,7 +450,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) * a free one dynamically. */ what = "bind before connect"; - err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len); if (err < 0) goto out; @@ -537,7 +537,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); what = "bind before listen"; - err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len); if (err < 0) goto out; diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index e0acc185e719..ef66a6359eb9 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -993,7 +993,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, int ret; sock_set_reuseaddr(s->sk); - ret = s->ops->bind(s, laddr, laddrlen); + ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, laddrlen); if (ret) return ret; ret = s->ops->connect(s, raddr, raddrlen, flags); @@ -1315,7 +1315,7 @@ int erdma_create_listen(struct iw_cm_id *id, int backlog) if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) s->sk->sk_bound_dev_if = dev->netdev->ifindex; - ret = s->ops->bind(s, (struct sockaddr *)laddr, + ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in)); if (ret) goto error; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 708b13993fdf..7fe118cacb3f 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1340,7 +1340,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, return rv; } - rv = s->ops->bind(s, laddr, size); + rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size); if (rv < 0) return rv; @@ -1789,7 +1789,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) goto error; } } - rv = s->ops->bind(s, (struct sockaddr *)laddr, + rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); @@ -1813,7 +1813,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) goto error; } } - rv = s->ops->bind(s, (struct sockaddr *)laddr, + rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index f732f6614d37..6ab036e4a35f 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -676,7 +676,7 @@ l1oip_socket_thread(void *data) hc->sin_remote.sin_port = htons((unsigned short)hc->remoteport); /* bind to incoming port */ - if (socket->ops->bind(socket, (struct sockaddr *)&hc->sin_local, + if (socket->ops->bind(socket, (struct sockaddr_unsized *)&hc->sin_local, sizeof(hc->sin_local))) { printk(KERN_ERR "%s: Failed to bind socket to port %d.\n", __func__, hc->localport); diff --git a/drivers/isdn/mISDN/socket.c 
b/drivers/isdn/mISDN/socket.c index b215b28cad7b..77b900db1cac 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -462,7 +462,7 @@ static int data_sock_getsockopt(struct socket *sock, int level, int optname, } static int -data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +data_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; @@ -696,7 +696,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } static int -base_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +base_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 90737cb71892..d07e87a0974c 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -382,8 +382,8 @@ drop: return NET_RX_DROP; } -static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr, - int sockaddr_len) +static int pptp_bind(struct socket *sock, struct sockaddr_unsized *uservaddr, + int sockaddr_len) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 9a96df1a511c..35d0bd91f6fd 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1834,7 +1834,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, sk_set_memalloc(queue->sock->sk); if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { - ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + ret = kernel_bind(queue->sock, (struct sockaddr_unsized *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { dev_err(nctrl->device, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 470bf37e5a63..d543da09ef8e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2055,7 +2055,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); - ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, + ret = kernel_bind(port->sock, (struct sockaddr_unsized *)&port->addr, sizeof(port->addr)); if (ret) { pr_err("failed to bind port socket %d\n", ret); diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index c2ac9a99ebbb..53aca059dc16 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -822,7 +822,7 @@ int iscsit_setup_np( sock_set_reuseaddr(sock->sk); ip_sock_set_freebind(sock->sk); - ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len); + ret = kernel_bind(sock, (struct sockaddr_unsized *)&np->np_sockaddr, len); if (ret < 0) { pr_err("kernel_bind() failed: %d\n", ret); goto fail; diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index fd7ed65e0197..da1b516b9cfd 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -650,7 +650,7 @@ static int pvcalls_back_bind(struct xenbus_device *dev, if (ret < 0) goto out; - ret = inet_bind(map->sock, (struct sockaddr *)&req->u.bind.addr, + ret = inet_bind(map->sock, (struct sockaddr_unsized *)&req->u.bind.addr, req->u.bind.len); if (ret < 0) goto out; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c1cadf8fb346..bf0e4ea0aafd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c 
@@ -82,16 +82,16 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) pr_err("Couldn't create RxGK CM key: %d\n", ret); - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); } if (ret < 0) goto error_2; srx.srx_service = YFS_CM_SERVICE; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret < 0) goto error_2; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9a0b6c2b6b01..0500421b6e3b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1134,7 +1134,7 @@ static int sctp_bind_addrs(struct socket *sock, __be16 port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(sock, addr, addr_len); + result = kernel_bind(sock, (struct sockaddr_unsized *)addr, addr_len); else result = sock_bind_add(sock->sk, addr, addr_len); @@ -1813,7 +1813,7 @@ static int dlm_tcp_bind(struct socket *sock) memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); - result = kernel_bind(sock, (struct sockaddr *)&src_addr, + result = kernel_bind(sock, (struct sockaddr_unsized *)&src_addr, addr_len); if (result < 0) { /* This *may* not indicate a critical error */ @@ -1852,7 +1852,7 @@ static int dlm_tcp_listen_bind(struct socket *sock) /* Bind to our port */ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], + return kernel_bind(sock, (struct sockaddr_unsized *)&dlm_local_addr[0], addr_len); } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b05d4e9d13b2..c7734193d8d7 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1615,7 +1615,7 @@ static void o2net_start_connect(struct work_struct *work) myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_port = htons(0); /* any port */ - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr, sizeof(myaddr)); if (ret) { mlog(ML_ERROR, "bind failed with %d at address %pI4\n", @@ -2002,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) INIT_WORK(&o2net_listen_work, o2net_accept_many); sock->sk->sk_reuse = SK_CAN_REUSE; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " "%pI4:%u\n", ret, &addr, ntohs(port)); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dd12f3eb61dc..96d972263020 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3112,7 +3112,7 @@ bind_socket(struct TCP_Server_Info *server) struct socket *socket = server->ssocket; rc = kernel_bind(socket, - (struct sockaddr *) &server->srcaddr, + (struct sockaddr_unsized *) &server->srcaddr, sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7a1e3dcc2cde..bf694bc78c65 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -519,10 +519,10 @@ static int create_socket(struct interface *iface) } if (ipv4) - ret = 
kernel_bind(ksmbd_socket, (struct sockaddr *)&sin, + ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin, sizeof(sin)); else - ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin6, + ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin6, sizeof(sin6)); if (ret) { pr_err("Failed to bind socket: %d\n", ret); diff --git a/include/linux/net.h b/include/linux/net.h index ec09620f40f7..0e316f063113 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -163,7 +163,7 @@ struct proto_ops { struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, - struct sockaddr *myaddr, + struct sockaddr_unsized *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr *vaddr, @@ -345,7 +345,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); -int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen); +int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c17a6585d0b0..1666cf6f539e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -42,7 +42,7 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2ccdf85f34f1..2188bad9a687 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1208,7 +1208,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/sock.h b/include/net/sock.h index c7e58b8e8a90..acbb78c96d69 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1920,7 +1920,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. 
*/ -int sock_no_bind(struct socket *, struct sockaddr *, int); +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a516745f732f..ef517bb307e2 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -966,7 +966,7 @@ static int p9_bind_privport(struct socket *sock) ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port); else ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port); - err = kernel_bind(sock, (struct sockaddr *)&stor, sizeof(stor)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor)); if (err != -EADDRINUSE) break; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 30242fe10341..45db43cde67f 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1149,7 +1149,7 @@ out: } /* Set the address 'our end' of the connection */ -static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_at *addr = (struct sockaddr_at *)uaddr; struct sock *sk = sock->sk; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 66d9a9bd5896..62fdf07c53de 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -24,7 +24,7 @@ static int pvc_shutdown(struct socket *sock, int how) return 0; } -static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { struct sock *sk = sock->sk; @@ -59,7 +59,7 @@ out: static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { - return pvc_bind(sock, sockaddr, sockaddr_len); + return pvc_bind(sock, (struct sockaddr_unsized *)sockaddr, sockaddr_len); } static int pvc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/svc.c b/net/atm/svc.c index f8137ae693b0..1906a493c8aa 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -97,7 +97,7 @@ static int svc_release(struct socket *sock) return 0; } -static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 6ef8b2a57a9b..23c558ff9682 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1094,7 +1094,7 @@ static int ax25_release(struct socket *sock) * that we've implemented support for SO_BINDTODEVICE. It is however small * and trivially backward compatible. 
*/ -static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index fc866759910d..ba9f48771e11 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1185,7 +1185,7 @@ static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, } #endif -static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, +static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_hci haddr; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3d98cb6291da..6a7e1b4a8701 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -944,7 +944,7 @@ static int iso_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, +static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -1022,7 +1022,7 @@ done: return err; } -static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, +static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 814fb8610ac4..ca7394d8fa4e 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -80,7 +80,7 @@ static int l2cap_validate_le_psm(u16 psm) return 0; } -static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 96250807b32b..d62fd6c57617 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -781,7 +781,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = 0; addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + *err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (*err < 0) goto failed; @@ -2068,7 +2068,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); goto failed; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 913402806fa0..8c8762bbc6de 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, return 0; } -static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_rc sa; struct sock *sk = sock->sk; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ab0cf442d57b..01d878205e58 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -605,7 +605,7 @@ static int 
sco_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, +static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; diff --git a/net/can/isotp.c b/net/can/isotp.c index 74ee1e52249b..ce588b85665a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1246,7 +1246,7 @@ static int isotp_release(struct socket *sock) return 0; } -static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 88e7160d4248..a2abedc757d0 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -440,7 +440,7 @@ static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) return 0; } -static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); diff --git a/net/can/raw.c b/net/can/raw.c index a53853f5e9af..f36a83d3447c 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -449,7 +449,7 @@ static int raw_release(struct socket *sock) return 0; } -static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; diff --git a/net/core/sock.c b/net/core/sock.c index 7a9bbc2afcf0..1e1ce18bba16 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3462,7 +3462,7 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off); * function, some default processing is provided. 
*/ -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 18d267921bb5..99ddfad9bb88 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -96,13 +96,13 @@ static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, return sk->sk_prot->sendmsg(sk, msg, len); } -static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); + return sk->sk_prot->bind(sk, (struct sockaddr *)uaddr, addr_len); return sock_no_bind(sock, uaddr, addr_len); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0784e2a873a1..aa43d16e48ff 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -464,9 +464,9 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) return __inet_bind(sk, uaddr, addr_len, flags); } -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { - return inet_bind_sk(sock->sk, uaddr, addr_len); + return inet_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 54386e06a813..11e5a88c923d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -29,7 +29,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 44d7de1eec4f..c92d27e35fbc 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -465,9 +465,9 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) } /* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { - return inet6_bind_sk(sock->sk, uaddr, addr_len); + return inet6_bind_sk(sock->sk, (struct sockaddr *)uaddr, addr_len); } EXPORT_SYMBOL(inet6_bind); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 0ff547a4bff7..b0d9286b33c8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -40,7 +40,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 4ddfc633d30c..3941e32cda69 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -563,7 +563,7 @@ static void __iucv_auto_name(struct iucv_sock *iucv) } /* Bind an unbound socket */ -static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { DECLARE_SOCKADDR(struct 
sockaddr_iucv *, sa, addr); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 369a2f2e459c..4b5e372a5cd4 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1503,7 +1503,7 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; @@ -1530,7 +1530,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5958a80fe14c..e5bb0c0d708c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -337,7 +337,7 @@ out: * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ -static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) +static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index b99ba14f39d2..5b1ef50637b7 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -49,7 +49,7 @@ static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) !addr->__smctp_pad0[2]; } -static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +static int mctp_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 69a3ccfc6310..be9149ac79dd 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -205,7 +205,7 @@ static void __mctp_route_test_init(struct kunit *test, addr.smctp_network = netid; addr.smctp_addr.s_addr = 8; addr.smctp_type = 0; - rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + rc = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); KUNIT_ASSERT_EQ(test, rc, 0); *devp = dev; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d568575cdcb5..53e2b095dfb1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3856,7 +3856,7 @@ static struct proto mptcp_prot = { .no_autobind = true, }; -static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; @@ -3870,10 +3870,10 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } if (sk->sk_family == AF_INET) - err = inet_bind_sk(ssk, uaddr, addr_len); + err = inet_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, uaddr, addr_len); + err = inet6_bind_sk(ssk, (struct sockaddr *)uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e8325890a322..d90237bf433c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1660,7 +1660,7 @@ int __mptcp_subflow_connect(struct 
sock *sk, const struct mptcp_pm_local *local, addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; - err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); + err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3402675bf521..d8c089ef387c 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1435,7 +1435,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1542,7 +1542,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 687a84c48882..18490a56edd0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -966,7 +966,7 @@ static void netlink_undo_bind(int group, long unsigned int groups, nlk->netlink_unbind(sock_net(sk), undo + 1); } -static int netlink_bind(struct socket *sock, struct sockaddr *addr, +static int netlink_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sock *sk = sock->sk; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 3331669d8e33..33468124d53d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -561,7 +561,7 @@ static int nr_release(struct socket *sock) return 0; } -static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 57a2f97004e1..26e6ceb48a82 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -56,7 +56,7 @@ static struct proto llcp_sock_proto = { .obj_size = sizeof(struct nfc_llcp_sock), }; -static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int llcp_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -146,7 +146,7 @@ error: return ret; } -static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, +static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 173e6edda08f..fccad2a529cc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3279,11 +3279,12 @@ out_unlock: * Bind a packet socket to a device */ -static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, +static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data_min) + 1]; + struct sockaddr *sa = (struct sockaddr 
*)uaddr; + char name[sizeof(sa->sa_data_min) + 1]; /* * Check legality @@ -3294,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); - name[sizeof(uaddr->sa_data_min)] = 0; + memcpy(name, sa->sa_data, sizeof(sa->sa_data_min)); + name[sizeof(sa->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } -static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index db2d552e9b32..478b02647733 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(pn_sock_unhash); static DEFINE_MUTEX(port_mutex); -static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) +static int pn_socket_bind(struct socket *sock, struct sockaddr_unsized *addr, int len) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -163,7 +163,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) u8 saddr; if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, addr, len); + return sk->sk_prot->bind(sk, (struct sockaddr *)addr, len); if (len < sizeof(struct sockaddr_pn)) return -EINVAL; @@ -206,8 +206,8 @@ static int pn_socket_autobind(struct socket *sock) memset(&sa, 0, sizeof(sa)); sa.spn_family = AF_PHONET; - err = pn_socket_bind(sock, (struct sockaddr *)&sa, - sizeof(struct sockaddr_pn)); + err = pn_socket_bind(sock, (struct sockaddr_unsized *)&sa, + sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 00c51cf693f3..00bd3dd9f0f9 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -824,7 +824,7 @@ static int qrtr_autobind(struct socket *sock) } /* Bind socket to specified sockaddr. 
*/ -static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 3de9350cbf30..bfcc1a453f23 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -714,7 +714,7 @@ int qrtr_ns_init(void) sq.sq_port = QRTR_PORT_CTRL; qrtr_ns.local_node = sq.sq_node; - ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); + ret = kernel_bind(qrtr_ns.sock, (struct sockaddr_unsized *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); goto err_wq; diff --git a/net/rds/bind.c b/net/rds/bind.c index 97a29172a8ee..f800d920d969 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -160,7 +160,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = in6addr_any; } -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); diff --git a/net/rds/rds.h b/net/rds/rds.h index 5b1c072e2e7f..a029e5fcdea7 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -735,7 +735,7 @@ extern wait_queue_head_t rds_poll_waitq; /* bind.c */ -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a0046e99d6df..1eff3b03ab77 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -145,7 +145,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = kernel_bind(sock, addr, addrlen); + ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen); if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 91e34af3fe5d..820d3e20de19 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -290,7 +290,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr_unsized *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? 
"IPv6" : "IPv4", ret); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 543f9e8ebb69..47369eab5aec 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -693,7 +693,7 @@ static int rose_release(struct socket *sock) return 0; } -static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int rose_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 36df0274d7b7..245f37a74394 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -127,7 +127,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, /* * bind a local address to an RxRPC socket */ -static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int rxrpc_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 2ea71e3831f7..98ea76fae70f 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -211,7 +211,7 @@ static int rxperf_open_socket(void) ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring); - ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *)&srx, sizeof(srx)); if (ret < 0) goto error_2; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e9d0e62e0b1b..be18ab08f15d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -421,7 +421,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, return sk; } -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; diff --git a/net/smc/smc.h b/net/smc/smc.h index 2c9084963739..a008dbe6d6f6 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -42,7 +42,7 @@ void smc_unhash_sk(struct sock *sk); void smc_release_cb(struct sock *sk); int smc_release(struct socket *sock); -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags); diff --git a/net/socket.c b/net/socket.c index e8892b218708..aaefb2e519a7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1872,7 +1872,7 @@ int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, - (struct sockaddr *)address, + (struct sockaddr_unsized *)address, addrlen); return err; } @@ -3583,13 +3583,13 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, * Returns 0 or an error. 
*/ -int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ca354ecfd02..318ee24ad900 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1457,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, - (struct sockaddr *)&rpc_inaddr_loopback, + (struct sockaddr_unsized *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, - (struct sockaddr *)&rpc_in6addr_loopback, + (struct sockaddr_unsized *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7b90abc5cf0e..16ff6c100821 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1557,7 +1557,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 3aa987e7f072..95732a45b059 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1845,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); - err = kernel_bind(sock, (struct sockaddr *)&myaddr, - transport->xprt.addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr, + transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index bc614a1f019c..3903a97ada7d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -710,7 +710,7 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return res; } -static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +static int tipc_bind(struct socket *sock, struct sockaddr_unsized *skaddr, int alen) { struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr; u32 atype = ua->addrtype; @@ -726,7 +726,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return -EACCES; } } - return tipc_sk_bind(sock, skaddr, alen); + return tipc_sk_bind(sock, (struct sockaddr *)skaddr, alen); } /** diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 54177caa9c12..788775f0eea7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -843,7 +843,7 @@ out: } static int unix_release(struct socket *); -static int unix_bind(struct socket *, struct sockaddr *, int); +static int unix_bind(struct socket *, struct sockaddr_unsized *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); @@ -1466,7 +1466,7 @@ out: return err; } -static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int 
addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 76763247a377..0e5609e7284b 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -987,7 +987,7 @@ static int vsock_release(struct socket *sock) } static int -vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { int err; struct sock *sk; @@ -995,7 +995,7 @@ vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) sk = sock->sk; - if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) + if (vsock_addr_cast((struct sockaddr *)addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 655d1e0ae25f..ca8006d8f792 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -670,7 +670,7 @@ out: return 0; } -static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2f26c918d448..ed8b612ec29d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1238,7 +1238,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs) return xs->fq_tmp && xs->cq_tmp; } -static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8074bc5f6f20..0497b5dea25c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -923,7 +923,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) goto out; } - err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&args->addr, args->addrlen); out: mutex_unlock(&sock_lock); -- cgit v1.2.3 From 85cb0757d7e1f9370a8b52a8b8144c37941cba0a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:11 -0800 Subject: net: Convert proto_ops connect() callbacks to use sockaddr_unsized Update all struct proto_ops connect() callback function prototypes from "struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the compiler about object sizes. Calls into struct proto handlers gain casts that will be removed in the struct proto conversion patch. No binary changes expected. 
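For orientation, the shape of this conversion can be sketched in a few self-contained lines. The snippet below is illustrative only: the struct definitions are simplified stand-ins rather than the kernel's, and ops_level_connect()/proto_level_connect() are invented names that mirror the proto_ops-to-proto call chain; the cast in the middle is the kind this patch adds and the follow-up struct proto conversion is expected to remove.

#include <stdio.h>

/* Stand-in for the legacy, fixed-size-looking address type. */
struct sockaddr {
	unsigned short sa_family;
	char sa_data[14];
};

/* Stand-in for sockaddr_unsized: no data member, so the compiler makes
 * no claim about the pointed-to object's size at the ops boundary. */
struct sockaddr_unsized {
	unsigned short sa_family;
};

/* Stand-in for a struct proto handler, still taking struct sockaddr
 * until the later conversion patch. */
static int proto_level_connect(struct sockaddr *addr, int addr_len)
{
	printf("connect: family %u, len %d\n", (unsigned)addr->sa_family, addr_len);
	return 0;
}

/* Stand-in for a converted proto_ops connect() callback: it takes
 * sockaddr_unsized and casts only when calling into the lower layer. */
static int ops_level_connect(struct sockaddr_unsized *uaddr, int addr_len)
{
	return proto_level_connect((struct sockaddr *)uaddr, addr_len);
}

int main(void)
{
	struct sockaddr sa = { .sa_family = 2 };	/* AF_INET-like value */

	return ops_level_connect((struct sockaddr_unsized *)&sa, (int)sizeof(sa));
}
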
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-3-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/infiniband/hw/erdma/erdma_cm.c | 2 +- drivers/infiniband/sw/siw/siw_cm.c | 2 +- drivers/net/ppp/pppoe.c | 4 ++-- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/wireless/ath/ath10k/qmi.c | 2 +- drivers/net/wireless/ath/ath11k/qmi.c | 2 +- drivers/net/wireless/ath/ath12k/qmi.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/slimbus/qcom-ngd-ctrl.c | 2 +- drivers/xen/pvcalls-back.c | 2 +- fs/coredump.c | 2 +- fs/dlm/lowcomms.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- fs/smb/client/connect.c | 2 +- include/linux/bpf-cgroup.h | 6 +++--- include/linux/net.h | 4 ++-- include/net/inet_common.h | 6 +++--- include/net/sctp/sctp.h | 2 +- include/net/sock.h | 2 +- include/net/vsock_addr.h | 2 +- net/9p/trans_fd.c | 6 +++--- net/appletalk/ddp.c | 2 +- net/atm/pvc.c | 4 ++-- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/iso.c | 2 +- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/rfcomm/core.c | 2 +- net/bluetooth/rfcomm/sock.c | 3 ++- net/bluetooth/sco.c | 2 +- net/caif/caif_socket.c | 2 +- net/can/bcm.c | 2 +- net/can/j1939/socket.c | 2 +- net/ceph/messenger.c | 2 +- net/core/sock.c | 2 +- net/ieee802154/socket.c | 4 ++-- net/ipv4/af_inet.c | 14 +++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/udp_tunnel_core.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- net/iucv/af_iucv.c | 4 ++-- net/l2tp/l2tp_core.c | 4 ++-- net/l2tp/l2tp_ppp.c | 2 +- net/llc/af_llc.c | 2 +- net/mctp/af_mctp.c | 2 +- net/mctp/test/utils.c | 5 +++-- net/mptcp/subflow.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 4 ++-- net/nfc/llcp_sock.c | 2 +- net/nfc/rawsock.c | 2 +- net/phonet/socket.c | 6 +++--- net/qrtr/af_qrtr.c | 2 +- net/rds/af_rds.c | 2 +- net/rds/tcp_connect.c | 2 +- net/rose/af_rose.c | 3 ++- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/socket.c | 4 ++-- net/smc/af_smc.c | 4 ++-- net/smc/smc.h | 2 +- net/socket.c | 8 ++++---- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprtsock.c | 5 +++-- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 8 ++++---- net/vmw_vsock/af_vsock.c | 6 +++--- net/vmw_vsock/vsock_addr.c | 2 +- net/x25/af_x25.c | 2 +- samples/qmi/qmi_sample_client.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 2 +- 72 files changed, 110 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d9296f74f902..33bc91665fe8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -458,7 +458,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) * stay C_WF_CONNECTION, don't go Disconnecting! 
*/ disconnect_on_error = 0; what = "connect"; - err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0); out: if (err < 0) { diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index ef66a6359eb9..ed21ba0037a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -996,7 +996,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, laddrlen); if (ret) return ret; - ret = s->ops->connect(s, raddr, raddrlen, flags); + ret = s->ops->connect(s, (struct sockaddr_unsized *)raddr, raddrlen, flags); return ret < 0 ? ret : 0; } diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 7fe118cacb3f..eb0bd4f79a85 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1344,7 +1344,7 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, if (rv < 0) return rv; - rv = s->ops->connect(s, raddr, size, flags); + rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags); return rv < 0 ? rv : 0; } diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4ac6afce267b..4275b393a454 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -608,8 +608,8 @@ static int pppoe_release(struct socket *sock) return 0; } -static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, - int sockaddr_len, int flags) +static int pppoe_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, + int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr; diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index d07e87a0974c..b18acd810561 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -415,8 +415,8 @@ out: return error; } -static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, - int sockaddr_len, int flags) +static int pptp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, + int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; diff --git a/drivers/net/wireless/ath/ath10k/qmi.c b/drivers/net/wireless/ath/ath10k/qmi.c index f1f33af0170a..8275345631a0 100644 --- a/drivers/net/wireless/ath/ath10k/qmi.c +++ b/drivers/net/wireless/ath/ath10k/qmi.c @@ -986,7 +986,7 @@ static int ath10k_qmi_new_server(struct qmi_handle *qmi_hdl, ath10k_dbg(ar, ATH10K_DBG_QMI, "wifi fw qmi service found\n"); - ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)&qmi->sq, + ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)&qmi->sq, sizeof(qmi->sq), 0); if (ret) { ath10k_err(ar, "failed to connect to a remote QMI service port\n"); diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c index aea56c38bf8f..ff6a97e328b8 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.c +++ b/drivers/net/wireless/ath/ath11k/qmi.c @@ -3177,7 +3177,7 @@ static int ath11k_qmi_ops_new_server(struct qmi_handle *qmi_hdl, sq->sq_node = service->node; sq->sq_port = service->port; - ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)sq, + ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)sq, sizeof(*sq), 0); if (ret) { ath11k_warn(ab, "failed to connect to qmi remote service: %d\n", ret); diff --git 
a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index 36325e62aa24..cf9c25df3ffd 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3740,7 +3740,7 @@ static int ath12k_qmi_ops_new_server(struct qmi_handle *qmi_hdl, sq->sq_node = service->node; sq->sq_port = service->port; - ret = kernel_connect(qmi_hdl->sock, (struct sockaddr *)sq, + ret = kernel_connect(qmi_hdl->sock, (struct sockaddr_unsized *)sq, sizeof(*sq), 0); if (ret) { ath12k_warn(ab, "qmi failed to connect to remote service %d\n", ret); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 35d0bd91f6fd..6795b8286c35 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1872,7 +1872,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); - ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + ret = kernel_connect(queue->sock, (struct sockaddr_unsized *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { dev_err(nctrl->device, diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c index 4fb66986cc22..fdb94dc4a730 100644 --- a/drivers/slimbus/qcom-ngd-ctrl.c +++ b/drivers/slimbus/qcom-ngd-ctrl.c @@ -463,7 +463,7 @@ static int qcom_slim_qmi_init(struct qcom_slim_ngd_ctrl *ctrl, } rc = kernel_connect(handle->sock, - (struct sockaddr *)&ctrl->qmi.svc_info, + (struct sockaddr_unsized *)&ctrl->qmi.svc_info, sizeof(ctrl->qmi.svc_info), 0); if (rc < 0) { dev_err(ctrl->dev, "Remote Service connect failed: %d\n", rc); diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index da1b516b9cfd..c5b6f6fa11eb 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -409,7 +409,7 @@ static int pvcalls_back_connect(struct xenbus_device *dev, ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock); if (ret < 0) goto out; - ret = inet_stream_connect(sock, sa, req->u.connect.len, 0); + ret = inet_stream_connect(sock, (struct sockaddr_unsized *)sa, req->u.connect.len, 0); if (ret < 0) { sock_release(sock); goto out; diff --git a/fs/coredump.c b/fs/coredump.c index 5c1c381ee380..14837d9e2abb 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -708,7 +708,7 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params * */ pidfs_coredump(cprm); - retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, + retval = kernel_connect(socket, (struct sockaddr_unsized *)(&addr), addr_len, O_NONBLOCK | SOCK_COREDUMP); if (retval) { diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0500421b6e3b..f832dafdaca8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1599,7 +1599,7 @@ static int dlm_connect(struct connection *con) log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); - result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0); + result = kernel_connect(sock, (struct sockaddr_unsized *)&addr, addr_len, 0); switch (result) { case -EINPROGRESS: /* not an error */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c7734193d8d7..79b281e32f4c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1638,7 +1638,7 @@ static void o2net_start_connect(struct work_struct *work) remoteaddr.sin_port = node->nd_ipv4_port; ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, + (struct sockaddr_unsized *)&remoteaddr, sizeof(remoteaddr), 
O_NONBLOCK); if (ret == -EINPROGRESS) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 96d972263020..73120988661a 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3411,7 +3411,7 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = kernel_connect(socket, saddr, slen, + rc = kernel_connect(socket, (struct sockaddr_unsized *)saddr, slen, server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index aedf573bdb42..a7fb4f46974f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -238,7 +238,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, NULL, NULL); \ __ret; \ }) @@ -248,7 +248,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, t_ctx, NULL); \ release_sock(sk); \ } \ @@ -266,7 +266,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ diff --git a/include/linux/net.h b/include/linux/net.h index 0e316f063113..db6bc997ca5b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -166,7 +166,7 @@ struct proto_ops { struct sockaddr_unsized *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, - struct sockaddr *vaddr, + struct sockaddr_unsized *vaddr, int sockaddr_len, int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); @@ -348,7 +348,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); -int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, +int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 1666cf6f539e..ebafd96912bb 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -23,11 +23,11 @@ struct sockaddr; struct socket; int inet_release(struct socket *sock); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg); -int inet_dgram_connect(struct socket *sock, struct sockaddr 
*uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index bb4b80c12541..58242b37b47a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); diff --git a/include/net/sock.h b/include/net/sock.h index acbb78c96d69..589fbce77217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1921,7 +1921,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * does not implement a particular function. */ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); -int sock_no_connect(struct socket *, struct sockaddr *, int, int); +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index cf8cc140d68d..c3f4cc206198 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr); void vsock_addr_unbind(struct sockaddr_vm *addr); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other); -int vsock_addr_cast(const struct sockaddr *addr, size_t len, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr); #endif diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index ef517bb307e2..49d674f5e73a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1018,7 +1018,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } err = READ_ONCE(csocket->ops)->connect(csocket, - (struct sockaddr *)&stor, + (struct sockaddr_unsized *)&stor, sizeof(stor), 0); if (err < 0) { pr_err("%s (%d): problem connecting socket to %s\n", @@ -1058,8 +1058,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, - sizeof(struct sockaddr_un) - 1, 0); + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server, + sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", __func__, task_pid_nr(current), addr, err); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 45db43cde67f..2a01fff46c9d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1204,7 +1204,7 @@ out: } /* Set the address we talk to */ -static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, +static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 62fdf07c53de..8f5e76f5dd9e 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -56,10 +56,10 @@ out: return error; } -static int pvc_connect(struct socket *sock, struct sockaddr 
*sockaddr, +static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { - return pvc_bind(sock, (struct sockaddr_unsized *)sockaddr, sockaddr_len); + return pvc_bind(sock, sockaddr, sockaddr_len); } static int pvc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/svc.c b/net/atm/svc.c index 1906a493c8aa..005964250ecd 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -153,7 +153,7 @@ out: return error; } -static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, +static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 23c558ff9682..7ebbff2f0020 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1175,7 +1175,7 @@ out: * FIXME: nonblock behaviour looks like it may have a bug. */ static int __must_check ax25_connect(struct socket *sock, - struct sockaddr *uaddr, int addr_len, int flags) + struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 6a7e1b4a8701..243505b89733 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1080,7 +1080,7 @@ done: return err; } -static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ca7394d8fa4e..9ee189c815d4 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -178,7 +178,7 @@ done: return err; } -static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, +static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index d62fd6c57617..57b1dca8141f 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -808,7 +808,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); + *err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 8c8762bbc6de..be6639cd6f59 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -371,7 +371,8 @@ done: return err; } -static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, + int alen, int flags) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 01d878205e58..7afe65e7ff37 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -639,7 +639,7 @@ done: return err; } -static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = 
sock->sk; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 039dfbd367c9..af218742af5a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -734,7 +734,7 @@ bad_sol: * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ -static int caif_connect(struct socket *sock, struct sockaddr *uaddr, +static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/can/bcm.c b/net/can/bcm.c index 5e690a2377e4..7eba8ae01a5b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1657,7 +1657,7 @@ static int bcm_release(struct socket *sock) return 0; } -static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, +static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index a2abedc757d0..6272326dd614 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -535,7 +535,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, in return ret; } -static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, +static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8181acaf870..70b25f4ecba6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -460,7 +460,7 @@ int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); con_sock_state_connecting(con); - ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), + ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", diff --git a/net/core/sock.c b/net/core/sock.c index 1e1ce18bba16..f97a0e958991 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3468,7 +3468,7 @@ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) } EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 99ddfad9bb88..b93fd85f248a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -107,7 +107,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *ua return sock_no_bind(sock, uaddr, addr_len); } -static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -118,7 +118,7 @@ static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); - return sk->sk_prot->connect(sk, uaddr, addr_len); + return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index aa43d16e48ff..0844de9ac6a4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -567,7 +567,7 @@ out: return err; } -int inet_dgram_connect(struct socket *sock, struct sockaddr 
*uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -584,14 +584,14 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = prot->pre_connect(sk, uaddr, addr_len); + err = prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; - return prot->connect(sk, uaddr, addr_len); + return prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); @@ -623,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; @@ -671,12 +671,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + err = sk->sk_prot->pre_connect(sk, (struct sockaddr *)uaddr, addr_len); if (err) goto out; } - err = sk->sk_prot->connect(sk, uaddr, addr_len); + err = sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); if (err < 0) goto out; @@ -741,7 +741,7 @@ sock_error: } EXPORT_SYMBOL(__inet_stream_connect); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { int err; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a9345aa5a2e5..dee578aad690 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1061,7 +1061,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? 
O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, uaddr, + err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 11e5a88c923d..b1f667c52cb2 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -38,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index b0d9286b33c8..cef3e0210744 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -52,7 +52,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, + (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 3941e32cda69..a4f1df92417d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -668,7 +668,7 @@ static int iucv_sock_autobind(struct sock *sk) return err; } -static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr) +static int afiucv_path_connect(struct socket *sock, struct sockaddr_unsized *addr) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); struct sock *sk = sock->sk; @@ -714,7 +714,7 @@ done: } /* Connect an unconnected socket */ -static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 4b5e372a5cd4..c4f4a57cd67c 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1513,7 +1513,7 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, - (struct sockaddr *)&ip6_addr, + (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; @@ -1538,7 +1538,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; - err = kernel_connect(sock, (struct sockaddr *)&ip_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 5e12e7ce17d8..ae4543d5597b 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -684,7 +684,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, /* connect() handler. 
Attach a PPPoX socket to a tunnel UDP socket */ -static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, +static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index e5bb0c0d708c..59d593bb5d18 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -477,7 +477,7 @@ out: * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ -static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, +static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 5b1ef50637b7..209a963112e3 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -128,7 +128,7 @@ out_release: /* Used to set a specific peer prior to bind. Not used for outbound * connections (Tag Owner set) since MCTP is a datagram protocol. */ -static int mctp_connect(struct socket *sock, struct sockaddr *addr, +static int mctp_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 953d41902771..35f6be814567 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -279,7 +279,7 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_addr.s_addr = setup->peer_addr; /* connect() type must match bind() type */ addr.smctp_type = setup->bind_type; - rc = kernel_connect(*sock, (struct sockaddr *)&addr, + rc = kernel_connect(*sock, (struct sockaddr_unsized *)&addr, sizeof(addr), 0); KUNIT_EXPECT_EQ(test, rc, 0); } @@ -292,5 +292,6 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_type = setup->bind_type; *ret_bind_errno = - kernel_bind(*sock, (struct sockaddr *)&addr, sizeof(addr)); + kernel_bind(*sock, (struct sockaddr_unsized *)&addr, + sizeof(addr)); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d90237bf433c..30961b3d1702 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1680,7 +1680,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); - err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); + err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d8c089ef387c..5a0c6f42bd8f 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1501,7 +1501,7 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr, salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 18490a56edd0..8e5151f0c6e4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1054,7 +1054,7 @@ unlock: return err; } -static int netlink_connect(struct socket *sock, struct sockaddr *addr, +static int netlink_connect(struct socket *sock, struct 
sockaddr_unsized *addr, int alen, int flags) { int err = 0; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 33468124d53d..5ed1a71ceec1 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -632,8 +632,8 @@ static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr return 0; } -static int nr_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +static int nr_connect(struct socket *sock, struct sockaddr_unsized *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 26e6ceb48a82..f1be1e84f665 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -648,7 +648,7 @@ out: return err; } -static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, +static int llcp_sock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5125392bb68e..b049022399ae 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -73,7 +73,7 @@ static int rawsock_release(struct socket *sock) return 0; } -static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, +static int rawsock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 478b02647733..9391378083a4 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -214,8 +214,8 @@ static int pn_socket_autobind(struct socket *sock) return 0; /* socket was already bound */ } -static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, - int len, int flags) +static int pn_socket_connect(struct socket *sock, struct sockaddr_unsized *addr, + int len, int flags) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -252,7 +252,7 @@ static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, pn->resource = pn_sockaddr_get_resource(spn); sock->state = SS_CONNECTING; - err = sk->sk_prot->connect(sk, addr, len); + err = sk->sk_prot->connect(sk, (struct sockaddr *)addr, len); if (err) { sock->state = SS_UNCONNECTED; pn->dobject = 0; diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 00bd3dd9f0f9..dab839f61ee9 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -1084,7 +1084,7 @@ out: return rc; } -static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, +static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 4a7217fbeab6..b396c673dfaf 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -533,7 +533,7 @@ out: } -static int rds_connect(struct socket *sock, struct sockaddr *uaddr, +static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 1eff3b03ab77..92891b0d224d 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -173,7 +173,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, (struct sockaddr_unsized *)addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == 
-EINPROGRESS) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 47369eab5aec..fd67494f2815 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -765,7 +765,8 @@ out_release: return err; } -static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +static int rose_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, + int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 245f37a74394..0c2c68c4b07e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -481,7 +481,7 @@ EXPORT_SYMBOL(rxrpc_kernel_set_notifications); * - this just targets it at a specific destination; no actual connection * negotiation takes place */ -static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, +static int rxrpc_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ac737e60829b..940abbced191 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4820,7 +4820,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, return err; } -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { if (addr_len < sizeof(uaddr->sa_family)) @@ -4829,7 +4829,7 @@ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return -EOPNOTSUPP; - return sctp_connect(sock->sk, uaddr, addr_len, flags); + return sctp_connect(sock->sk, (struct sockaddr *)uaddr, addr_len, flags); } /* Only called when shutdown a listening SCTP socket. 
*/ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index be18ab08f15d..0ef3e16a8517 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1642,7 +1642,7 @@ out: release_sock(&smc->sk); } -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; @@ -1694,7 +1694,7 @@ int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } - rc = kernel_connect(smc->clcsock, addr, alen, flags); + rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; diff --git a/net/smc/smc.h b/net/smc/smc.h index a008dbe6d6f6..9e6af72784ba 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -44,7 +44,7 @@ void smc_release_cb(struct sock *sk); int smc_release(struct socket *sock); int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags); int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg); diff --git a/net/socket.c b/net/socket.c index aaefb2e519a7..101a7ed574e7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2099,8 +2099,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, - addrlen, sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -3662,14 +3662,14 @@ EXPORT_SYMBOL(kernel_accept); * Returns 0 or an error code. */ -int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, +int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 318ee24ad900..58442ae1c2da 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1474,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_connect(sock, sap, salen, 0); + err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 95732a45b059..2e1fe6013361 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2005,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_stream_start_connect(transport); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0); } /** @@ -2405,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... 
*/ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), + xprt->addrlen, O_NONBLOCK); } /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3903a97ada7d..817b07d95a91 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2565,7 +2565,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) * * Return: 0 on success, errno otherwise */ -static int tipc_connect(struct socket *sock, struct sockaddr *dest, +static int tipc_connect(struct socket *sock, struct sockaddr_unsized *dest, int destlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 788775f0eea7..3b44cadaed96 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -844,7 +844,7 @@ out: static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr_unsized *, int); -static int unix_stream_connect(struct socket *, struct sockaddr *, +static int unix_stream_connect(struct socket *, struct sockaddr_unsized *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); @@ -866,7 +866,7 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); -static int unix_dgram_connect(struct socket *, struct sockaddr *, +static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, @@ -1512,7 +1512,7 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) unix_state_unlock(sk2); } -static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, +static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; @@ -1631,7 +1631,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) return timeo; } -static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, +static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 0e5609e7284b..72bb6b7ed386 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -995,7 +995,7 @@ vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) sk = sock->sk; - if (vsock_addr_cast((struct sockaddr *)addr, addr_len, &vm_addr) != 0) + if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); @@ -1328,7 +1328,7 @@ out: } static int vsock_dgram_connect(struct socket *sock, - struct sockaddr *addr, int addr_len, int flags) + struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; @@ -1528,7 +1528,7 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_connect(struct socket *sock, struct sockaddr *addr, +static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, int 
addr_len, int flags) { int err; diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c index 223b9660a759..a986aa6fff9b 100644 --- a/net/vmw_vsock/vsock_addr.c +++ b/net/vmw_vsock/vsock_addr.c @@ -57,7 +57,7 @@ bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); -int vsock_addr_cast(const struct sockaddr *addr, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ca8006d8f792..af8762b24039 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -743,7 +743,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk) return rc; } -static int x25_connect(struct socket *sock, struct sockaddr *uaddr, +static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c index b27d861f354f..d1814582319b 100644 --- a/samples/qmi/qmi_sample_client.c +++ b/samples/qmi/qmi_sample_client.c @@ -468,7 +468,7 @@ static int qmi_sample_probe(struct platform_device *pdev) return ret; sq = dev_get_platdata(&pdev->dev); - ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq, + ret = kernel_connect(sample->qmi.sock, (struct sockaddr_unsized *)sq, sizeof(*sq), 0); if (ret < 0) { pr_err("failed to connect to remote service port\n"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 0497b5dea25c..8eeebaa951f0 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -900,7 +900,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) goto out; } - err = kernel_connect(sock, (struct sockaddr *)&args->addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&args->addr, args->addrlen, 0); out: mutex_unlock(&sock_lock); -- cgit v1.2.3 From 3d39d34146f2b38127eadf36a0513e130eaa7eec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:12 -0800 Subject: net: Remove struct sockaddr from net.h Now that struct sockaddr is no longer used by net.h, remove it. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-4-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index db6bc997ca5b..f58b38ab37f8 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -148,7 +148,6 @@ typedef struct { struct vm_area_struct; struct page; -struct sockaddr; struct msghdr; struct module; struct sk_buff; -- cgit v1.2.3 From 8116d803e7f8f20bf00ce23ff8bd0baab41e1635 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:14 -0800 Subject: bpf: Convert cgroup sockaddr filters to use sockaddr_unsized consistently Update BPF cgroup sockaddr filtering infrastructure to use sockaddr_unsized consistently throughout the call chain, removing redundant explicit casts from callers. No binary changes expected. 
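The shape of the change at a call site, as a minimal before/after sketch (sk, sin and sin_addr_len are the locals already used in the inet_getname() hunk below):

	/* before: the caller casts for the macro */
	BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
			       CGROUP_INET4_GETPEERNAME);

	/* after: the macro performs the sockaddr_unsized cast internally */
	BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
			       CGROUP_INET4_GETPEERNAME);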
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-6-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 17 ++++++++++------- kernel/bpf/cgroup.c | 4 ++-- net/ipv4/af_inet.c | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7fb4f46974f..d1eb5c7729cb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -120,7 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -238,8 +238,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, NULL, NULL); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, NULL, NULL); \ __ret; \ }) @@ -248,8 +249,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, t_ctx, NULL); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ @@ -266,8 +268,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, NULL, &__flags); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 248f517d66d0..497aedc9afa1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * returned value != 1 during execution. In all other cases, 0 is returned. 
*/ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -1673,7 +1673,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, { struct bpf_sock_addr_kern ctx = { .sk = sk, - .uaddr = uaddr, + .uaddr = (struct sockaddr *)uaddr, .t_ctx = t_ctx, }; struct sockaddr_storage unspec; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d5ac089356eb..a31b94ce8968 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -834,7 +834,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; @@ -842,7 +842,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); -- cgit v1.2.3 From c1a799eef62b8c3298a4d82753fe0f2a448e5e4f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:15 -0800 Subject: bpf: Convert bpf_sock_addr_kern "uaddr" to sockaddr_unsized Change struct bpf_sock_addr_kern to use sockaddr_unsized for the "uaddr" field instead of sockaddr. This improves type safety in the BPF cgroup socket address filtering code. The casting in __cgroup_bpf_run_filter_sock_addr() is updated to match the new type, removing an unnecessary cast in the initialization and updating the conditional assignment to use the appropriate sockaddr_unsized cast. Additionally rename the "unspec" variable to "storage" to better align with its usage. No binary changes expected. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-7-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 2 +- kernel/bpf/cgroup.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a..e116de7edc58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1515,7 +1515,7 @@ static inline int bpf_tell_extensions(void) struct bpf_sock_addr_kern { struct sock *sk; - struct sockaddr *uaddr; + struct sockaddr_unsized *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. 
We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 497aedc9afa1..69988af44b37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1673,10 +1673,10 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, { struct bpf_sock_addr_kern ctx = { .sk = sk, - .uaddr = (struct sockaddr *)uaddr, + .uaddr = uaddr, .t_ctx = t_ctx, }; - struct sockaddr_storage unspec; + struct sockaddr_storage storage; struct cgroup *cgrp; int ret; @@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, return 0; if (!ctx.uaddr) { - memset(&unspec, 0, sizeof(unspec)); - ctx.uaddr = (struct sockaddr *)&unspec; + memset(&storage, 0, sizeof(storage)); + ctx.uaddr = (struct sockaddr_unsized *)&storage; ctx.uaddrlen = 0; } else { ctx.uaddrlen = *uaddrlen; -- cgit v1.2.3 From 2b5e9f9b7e414c5eeb20dd7a7b80816ff55cf57b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:16 -0800 Subject: net: Convert struct sockaddr to fixed-size "sa_data[14]" Revert struct sockaddr from flexible array to fixed 14-byte "sa_data", to solve over 36,000 -Wflex-array-member-not-at-end warnings, since struct sockaddr is embedded within many network structs. With socket/proto sockaddr-based internal APIs switched to use struct sockaddr_unsized, there should be no more uses of struct sockaddr that depend on reading beyond the end of struct sockaddr::sa_data that might trigger bounds checking. Comparing an x86_64 "allyesconfig" vmlinux build before and after this patch showed no new "ud1" instructions from CONFIG_UBSAN_BOUNDS nor any new "field-spanning" memcpy CONFIG_FORTIFY_SOURCE instrumentations. Cc: Gustavo A. R. Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-8-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 6 ++---- net/core/dev.c | 2 +- net/core/dev_ioctl.c | 2 +- net/ipv4/arp.c | 2 +- net/packet/af_packet.c | 10 +++++----- tools/perf/trace/beauty/include/linux/socket.h | 5 +---- 6 files changed, 11 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 7b1a01be29da..944027f9765e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -32,12 +32,10 @@ typedef __kernel_sa_family_t sa_family_t; * 1003.1g requires sa_family_t and that sa_data is char. */ +/* Deprecated for in-kernel use. Use struct sockaddr_unsized instead. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - union { - char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ - DECLARE_FLEX_ARRAY(char, sa_data); - }; + char sa_data[14]; /* 14 bytes of protocol address */ }; /** diff --git a/net/core/dev.c b/net/core/dev.c index ba39146bbd25..537aa43edff0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9973,7 +9973,7 @@ DECLARE_RWSEM(dev_addr_sem); /* "sa" is a true struct sockaddr with limited "sa_data" member. 
*/ int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data_min); + size_t size = sizeof(sa->sa_data); struct net_device *dev; int ret = 0; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index ad54b12d4b4c..b3ce0fb24a69 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -596,7 +596,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data_min), + min(sizeof(ifr->ifr_hwaddr.sa_data), (size_t)dev->addr_len)); netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f3bfecf8a234..7f3863daaa40 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1189,7 +1189,7 @@ static int arp_req_get(struct net *net, struct arpreq *r) read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + min(dev->addr_len, sizeof(r->arp_ha.sa_data))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fccad2a529cc..494d628d10a5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3284,7 +3284,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, { struct sock *sk = sock->sk; struct sockaddr *sa = (struct sockaddr *)uaddr; - char name[sizeof(sa->sa_data_min) + 1]; + char name[sizeof(sa->sa_data) + 1]; /* * Check legality @@ -3295,8 +3295,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, sa->sa_data, sizeof(sa->sa_data_min)); - name[sizeof(sa->sa_data_min)] = 0; + memcpy(name, sa->sa_data, sizeof(sa->sa_data)); + name[sizeof(sa->sa_data)] = 0; return packet_do_bind(sk, name, 0, 0); } @@ -3581,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 3b262487ec06..77d7c59f5d8b 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -34,10 +34,7 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - union { - char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ - DECLARE_FLEX_ARRAY(char, sa_data); - }; + char sa_data[14]; /* 14 bytes of protocol address */ }; struct linger { -- cgit v1.2.3 From 7c5b184db7145fd417785377337bd15c4fe1d0f4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:29:59 -0400 Subject: genpt: Generic Page Table base API The generic API is intended to be separated from the implementation of page table algorithms. It contains only accessors for walking and manipulating the table and helpers that are useful for building an implementation. 
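As a rough illustration of that split (a sketch only; sketch_walk_fn is a made-up name, everything else comes from the headers added below), an implementation-side walker is composed entirely from the generic accessors:

	static int sketch_walk_fn(struct pt_range *range, void *arg,
				  unsigned int level, struct pt_table_p *table)
	{
		struct pt_state pts = pt_init(range, level, table);
		int ret;

		for_each_pt_level_entry(&pts) {
			if (pts.type != PT_ENTRY_TABLE)
				continue;
			ret = pt_descend(&pts, arg, sketch_walk_fn);
			if (ret)
				return ret;
		}
		return 0;
	}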
Memory management is not in the generic API, but part of the implementation. Using a multi-compilation approach the implementation module would include headers in this order: common.h defs_FMT.h pt_defs.h FMT.h pt_common.h IMPLEMENTATION.h Where each compilation unit would have a combination of FMT and IMPLEMENTATION to produce a per-format per-implementation module. The API is designed so that the format headers have minimal logic, and default implementations are provided if the format doesn't include one. Generally formats provide their code via an inline function using the pattern: static inline FMTpt_XX(..) {} #define pt_XX FMTpt_XX The common code then enforces a function signature so that there is no drift in function arguments, or accidental polymorphic functions (as has been slightly troublesome in mm). Use of function-like #defines are avoided in the format even though many of the functions are small enough. Provide kdocs for the API surface. This is enough to implement the 8 initial format variations with all of their features: * Entries comprised of contiguous blocks of IO PTEs for larger page sizes (AMDv1, ARMv8) * Multi-level tables, up to 6 levels. Runtime selected top level * The size of the top table level can be selected at runtime (ARM's concatenated tables) * The number of levels in the table can optionally increase dynamically during map (AMDv1) * Optional leaf entries at any level * 32 bit/64 bit virtual and output addresses, using every bit * Sign extended addressing (x86) * Dirty tracking A basic simple format takes about 200 lines to declare the require inline functions. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- .clang-format | 1 + drivers/iommu/Kconfig | 2 + drivers/iommu/generic_pt/Kconfig | 20 + drivers/iommu/generic_pt/pt_common.h | 358 ++++++++++++++++ drivers/iommu/generic_pt/pt_defs.h | 329 +++++++++++++++ drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 +++++++++++ drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++++++++++++++++++++++ drivers/iommu/generic_pt/pt_log2.h | 122 ++++++ include/linux/generic_pt/common.h | 135 ++++++ 9 files changed, 1836 insertions(+) create mode 100644 drivers/iommu/generic_pt/Kconfig create mode 100644 drivers/iommu/generic_pt/pt_common.h create mode 100644 drivers/iommu/generic_pt/pt_defs.h create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h create mode 100644 drivers/iommu/generic_pt/pt_iter.h create mode 100644 drivers/iommu/generic_pt/pt_log2.h create mode 100644 include/linux/generic_pt/common.h (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index f371a13b4d19..9e6a9177f8fb 100644 --- a/.clang-format +++ b/.clang-format @@ -415,6 +415,7 @@ ForEachMacros: - 'for_each_prop_dlc_cpus' - 'for_each_prop_dlc_platforms' - 'for_each_property_of_node' + - 'for_each_pt_level_entry' - 'for_each_rdt_resource' - 'for_each_reg' - 'for_each_reg_filtered' diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 70d29b14d851..c9ae3221cd6f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -384,3 +384,5 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. 
endif # IOMMU_SUPPORT + +source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig new file mode 100644 index 000000000000..fb0f431ddba0 --- /dev/null +++ b/drivers/iommu/generic_pt/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig GENERIC_PT + bool "Generic Radix Page Table" + help + Generic library for building radix tree page tables. + + Generic PT provides a set of HW page table formats and a common + set of APIs to work with them. + +if GENERIC_PT +config DEBUG_GENERIC_PT + bool "Extra debugging checks for GENERIC_PT" + help + Enable extra run time debugging checks for GENERIC_PT code. This + incurs a runtime cost and should not be enabled for production + kernels. + + The kunit tests require this to be enabled to get full coverage. +endif diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h new file mode 100644 index 000000000000..f64f800725db --- /dev/null +++ b/drivers/iommu/generic_pt/pt_common.h @@ -0,0 +1,358 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included after the format. It contains definitions + * that build on the format definitions to create the basic format API. + * + * The format API is listed here, with kdocs. The functions without bodies are + * implemented in the format using the pattern: + * static inline FMTpt_XXX(..) {..} + * #define pt_XXX FMTpt_XXX + * + * If the format doesn't implement a function then pt_fmt_defaults.h can provide + * a generic version. + * + * The routines marked "@pts: Entry to query" operate on the entire contiguous + * entry and can be called with a pts->index pointing to any sub item that makes + * up that entry. + * + * The header order is: + * pt_defs.h + * FMT.h + * pt_common.h + */ +#ifndef __GENERIC_PT_PT_COMMON_H +#define __GENERIC_PT_PT_COMMON_H + +#include "pt_defs.h" +#include "pt_fmt_defaults.h" + +/** + * pt_attr_from_entry() - Convert the permission bits back to attrs + * @pts: Entry to convert from + * @attrs: Resulting attrs + * + * Fill in the attrs with the permission bits encoded in the current leaf entry. + * The attrs should be usable with pt_install_leaf_entry() to reconstruct the + * same entry. + */ +static inline void pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs); + +/** + * pt_can_have_leaf() - True if the current level can have an OA entry + * @pts: The current level + * + * True if the current level can support pt_install_leaf_entry(). A leaf + * entry produce an OA. + */ +static inline bool pt_can_have_leaf(const struct pt_state *pts); + +/** + * pt_can_have_table() - True if the current level can have a lower table + * @pts: The current level + * + * Every level except 0 is allowed to have a lower table. + */ +static inline bool pt_can_have_table(const struct pt_state *pts) +{ + /* No further tables at level 0 */ + return pts->level > 0; +} + +/** + * pt_clear_entries() - Make entries empty (non-present) + * @pts: Starting table index + * @num_contig_lg2: Number of contiguous items to clear + * + * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY + * and does not have any effect on table walking. The starting index must be + * aligned to num_contig_lg2. 
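+ *
+ * A minimal usage sketch: clearing the single (non-contiguous) item that pts
+ * currently points at is simply
+ *
+ *	pt_clear_entries(pts, ilog2(1));
+ *
+ * while clearing a whole contiguous leaf entry passes its
+ * pt_entry_num_contig_lg2(), with pts->index aligned to the entry's start.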
+ */ +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2); + +/** + * pt_entry_make_write_dirty() - Make an entry dirty + * @pts: Table entry to change + * + * Make pt_entry_is_write_dirty() return true for this entry. This can be called + * asynchronously with any other table manipulation under a RCU lock and must + * not corrupt the table. + */ +static inline bool pt_entry_make_write_dirty(struct pt_state *pts); + +/** + * pt_entry_make_write_clean() - Make the entry write clean + * @pts: Table entry to change + * + * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will + * eventually be notified of this change via a TLB flush, which is the point + * that the HW must become synchronized. Any "write dirty" prior to the TLB + * flush can be lost, but once the TLB flush completes all writes must make + * their entries write dirty. + * + * The format should alter the entry in a way that is compatible with any + * concurrent update from HW. The entire contiguous entry is changed. + */ +static inline void pt_entry_make_write_clean(struct pt_state *pts); + +/** + * pt_entry_is_write_dirty() - True if the entry has been written to + * @pts: Entry to query + * + * "write dirty" means that the HW has written to the OA translated + * by this entry. If the entry is contiguous then the consolidated + * "write dirty" for all the items must be returned. + */ +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts); + +/** + * pt_dirty_supported() - True if the page table supports dirty tracking + * @common: Page table to query + */ +static inline bool pt_dirty_supported(struct pt_common *common); + +/** + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry + * @pts: Entry to query + * + * Return the number of contiguous items this leaf entry spans. If the entry + * is single item it returns ilog2(1). + */ +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); + +/** + * pt_entry_oa() - Output Address for this leaf entry + * @pts: Entry to query + * + * Return the output address for the start of the entry. If the entry + * is contiguous this returns the same value for each sub-item. I.e.:: + * + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 + * + * See pt_item_oa(). The format should implement one of these two functions + * depending on how it stores the OAs in the table. + */ +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); + +/** + * pt_entry_oa_lg2sz() - Return the size of an OA entry + * @pts: Entry to query + * + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise + * it returns the total VA/OA size of the entire contiguous entry. + */ +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{ + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); +} + +/** + * pt_entry_oa_exact() - Return the complete OA for an entry + * @pts: Entry to query + * + * During iteration the first entry could have a VA with an offset from the + * natural start of the entry. Return the exact OA including the pts's VA + * offset. + */ +static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts) +{ + return _pt_entry_oa_fast(pts) | + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); +} + +/** + * pt_full_va_prefix() - The top bits of the VA + * @common: Page table to query + * + * This is usually 0, but some formats have their VA space going downward from + * PT_VADDR_MAX, and will return that instead. 
This value must always be + * adjusted by struct pt_common max_vasz_lg2. + */ +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); + +/** + * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry + * @common: Page table to query + * + * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT). + * This is useful to create optimized paths for common cases of PAGE_SIZE + * mappings. + */ +static inline bool pt_has_system_page_size(const struct pt_common *common); + +/** + * pt_install_leaf_entry() - Write a leaf entry to the table + * @pts: Table index to change + * @oa: Output Address for this leaf + * @oasz_lg2: Size in VA/OA for this leaf + * @attrs: Attributes to modify the entry + * + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates + * the VA indicated by pts to the given OA. + * + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. + * + * This must not be called if pt_can_have_leaf() == false. Contiguous sizes + * not indicated by pt_possible_sizes() must not be specified. + */ +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs); + +/** + * pt_install_table() - Write a table entry to the table + * @pts: Table index to change + * @table_pa: CPU physical address of the lower table's memory + * @attrs: Attributes to modify the table index + * + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the + * current entry loaded. The pts is updated with the installed entry. + * + * This must not be called if pt_can_have_table() == false. + * + * Returns: true if the table was installed successfully. + */ +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs); + +/** + * pt_item_oa() - Output Address for this leaf item + * @pts: Item to query + * + * Return the output address for this item. If the item is part of a contiguous + * entry it returns the value of the OA for this individual sub item. + * + * See pt_entry_oa(). The format should implement one of these two functions + * depending on how it stores the OA's in the table. + */ +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); + +/** + * pt_load_entry_raw() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Return the type of entry that was loaded. pts->entry will be filled in with + * the entry's content. See pt_load_entry() + */ +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); + +/** + * pt_max_oa_lg2() - Return the maximum OA the table format can hold + * @common: Page table to query + * + * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the + * OA. This is the absolute maximum address the table can hold. struct pt_common + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. + */ +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common); + +/** + * pt_num_items_lg2() - Return the number of items in this table level + * @pts: The current level + * + * The number of items in a table level defines the number of bits this level + * decodes from the VA. This function is not called for the top level, + * so it does not need to compute a special value for the top case. 
The + * result for the top is based on pt_common max_vasz_lg2. + * + * The value is used as part of determining the table indexes via the + * equation:: + * + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) + */ +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); + +/** + * pt_pgsz_lg2_to_level - Return the level that maps the page size + * @common: Page table to query + * @pgsize_lg2: Log2 page size + * + * Returns the table level that will map the given page size. The page + * size must be part of the pt_possible_sizes() for some level. + */ +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2); + +/** + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level + * @pts: The current level + * + * Each level has a list of possible output sizes that can be installed as + * leaf entries. If pt_can_have_leaf() is false returns zero. + * + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating + * that a non-contiguous single item leaf entry is supported. The following + * pt_num_items_lg2() number of bits can be set indicating contiguous entries + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be + * set, contiguous entries cannot span the entire table. + * + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all + * supported sizes in the entire table. + */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); + +/** + * pt_table_item_lg2sz() - Size of a single item entry in this table level + * @pts: The current level + * + * The size of the item specifies how much VA and OA a single item occupies. + * + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous + * entries. + */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +/** + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table + * @pts: The current level + * + * Return the size of VA decoded by the entire table level. + */ +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) +{ + if (pts->range->top_level == pts->level) + return pts->range->max_vasz_lg2; + return min_t(unsigned int, pts->range->common->max_vasz_lg2, + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); +} + +/** + * pt_table_pa() - Return the CPU physical address of the table entry + * @pts: Entry to query + * + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same + * value passed to pt_install_table(). + */ +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); + +/** + * pt_table_ptr() - Return a CPU pointer for a table item + * @pts: Entry to query + * + * Same as pt_table_pa() but returns a CPU pointer. + */ +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) +{ + return __va(pt_table_pa(pts)); +} + +/** + * pt_load_entry() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Set the type of entry that was loaded. pts->entry and pts->table_lower + * will be filled in with the entry's content. 
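+ *
+ * A minimal caller sketch (oa is a local pt_oaddr_t):
+ *
+ *	pt_load_entry(pts);
+ *	if (pts->type == PT_ENTRY_OA)
+ *		oa = pt_entry_oa_exact(pts);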
+ */ +static inline void pt_load_entry(struct pt_state *pts) +{ + pts->type = pt_load_entry_raw(pts); + if (pts->type == PT_ENTRY_TABLE) + pts->table_lower = pt_table_ptr(pts); +} +#endif diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h new file mode 100644 index 000000000000..819057de50d8 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included before the format. It contains definitions + * that are required to compile the format. The header order is: + * pt_defs.h + * fmt_XX.h + * pt_common.h + */ +#ifndef __GENERIC_PT_DEFS_H +#define __GENERIC_PT_DEFS_H + +#include + +#include +#include +#include +#include +#include +#include +#include "pt_log2.h" + +/* Header self-compile default defines */ +#ifndef pt_write_attrs +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; +#endif + +struct pt_table_p; + +enum { + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, +}; + +/* + * The format instantiation can have features wired off or on to optimize the + * code gen. Supported features are just a reflection of what the current set of + * kernel users want to use. + */ +#ifndef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES 0 +#endif + +/* + * When in debug mode we compile all formats with all features. This allows the + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or + * FULL_VA. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +enum { + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, + PT_DEBUG_SUPPORTED_FEATURES = + UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : + BIT(PT_FEAT_SIGN_EXTEND)), +}; +#undef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES +#endif + +#ifndef PT_FORCE_ENABLED_FEATURES +#define PT_FORCE_ENABLED_FEATURES 0 +#endif + +/** + * DOC: Generic Page Table Language + * + * Language used in Generic Page Table + * VA + * The input address to the page table, often the virtual address. + * OA + * The output address from the page table, often the physical address. + * leaf + * An entry that results in an output address. + * start/end + * An half-open range, e.g. [0,0) refers to no VA. + * start/last + * An inclusive closed range, e.g. [0,0] refers to the VA 0 + * common + * The generic page table container struct pt_common + * level + * Level 0 is always a table of only leaves with no futher table pointers. + * Increasing levels increase the size of the table items. The least + * significant VA bits used to index page tables are used to index the Level + * 0 table. The various labels for table levels used by HW descriptions are + * not used. + * top_level + * The inclusive highest level of the table. A two-level table + * has a top level of 1. + * table + * A linear array of translation items for that level. + * index + * The position in a table of an element: item = table[index] + * item + * A single index in a table + * entry + * A single logical element in a table. If contiguous pages are not + * supported then item and entry are the same thing, otherwise entry refers + * to all the items that comprise a single contiguous translation. 
+ * item/entry_size + * The number of bytes of VA the table index translates for. + * If the item is a table entry then the next table covers + * this size. If the entry translates to an output address then the + * full OA is: OA | (VA % entry_size) + * contig_count + * The number of consecutive items fused into a single entry. + * item_size * contig_count is the size of that entry's translation. + * lg2 + * Indicates the value is encoded as log2, i.e. 1<table)) + +/* + * Try to install a new table pointer. The locking methodology requires this to + * be atomic (multiple threads can race to install a pointer). The losing + * threads will fail the atomic and return false. They should free any memory + * and reparse the table level again. + */ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{ + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg64_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} +#endif + +static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry) +{ + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} + +#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr)) + +static inline bool pt_feature(const struct pt_common *common, + unsigned int feature_nr) +{ + if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr)) + return true; + if (!PT_SUPPORTED_FEATURE(feature_nr)) + return false; + return common->features & BIT(feature_nr); +} + +static inline bool pts_feature(const struct pt_state *pts, + unsigned int feature_nr) +{ + return pt_feature(pts->range->common, feature_nr); +} + +/* + * PT_WARN_ON is used for invariants that the kunit should be checking can't + * happen. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{ + return false; +} +#endif + +/* These all work on the VA type */ +#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2) +#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2) +#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2) +#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2) +#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2) +#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2) +#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2) +#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2) +#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2) +#define vaffs(a) ffs_t(pt_vaddr_t, a) +#define vafls(a) fls_t(pt_vaddr_t, a) +#define vaffz(a) ffz_t(pt_vaddr_t, a) + +/* + * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and + * generate a useful defined result. The non-fva versions will malfunction at + * this extreme. 
+ */ +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return 0; + return log2_div_t(pt_vaddr_t, a, b_lg2); +} + +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return a; + return log2_mod_t(pt_vaddr_t, a, b_lg2); +} + +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, + unsigned int c_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) + return true; + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, + unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return val; + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return PT_VADDR_MAX; + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); +} + +/* These all work on the OA type */ +#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2) +#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2) +#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2) +#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2) +#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2) +#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2) +#define oaffs(a) ffs_t(pt_oaddr_t, a) +#define oafls(a) fls_t(pt_oaddr_t, a) +#define oaffz(a) ffz_t(pt_oaddr_t, a) + +static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem, + unsigned int top_level) +{ + return top_level | (uintptr_t)table_mem; +} + +static inline void pt_top_set(struct pt_common *common, + struct pt_table_p *table_mem, + unsigned int top_level) +{ + WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level)); +} + +static inline void pt_top_set_level(struct pt_common *common, + unsigned int top_level) +{ + pt_top_set(common, NULL, top_level); +} + +static inline unsigned int pt_top_get_level(const struct pt_common *common) +{ + return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS); +} + +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2); + +#endif diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 000000000000..60d594bbb106 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Default definitions for formats that don't define these functions. + */ +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H + +#include "pt_defs.h" +#include + +/* Header self-compile default defines */ +#ifndef pt_load_entry_raw +#include "fmt/amdv1.h" +#endif + +/* + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and + * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top. 
+ */ +#ifndef pt_table_item_lg2sz +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts) +{ + return PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level; +} +#endif + +#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2) +{ + return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) / + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); +} +#endif + +/* + * If not supplied by the format then contiguous pages are not supported. + * + * If contiguous pages are supported then the format must also provide + * pt_contig_count_lg2() if it supports a single contiguous size per level, + * or pt_possible_sizes() if it supports multiple sizes per level. + */ +#ifndef pt_entry_num_contig_lg2 +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} + +/* + * Return the number of contiguous OA items forming an entry at this table level + */ +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} +#endif + +/* If not supplied by the format then dirty tracking is not supported */ +#ifndef pt_entry_is_write_dirty +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts) +{ + return false; +} + +static inline void pt_entry_make_write_clean(struct pt_state *pts) +{ +} + +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return false; +} +#else +/* If not supplied then dirty tracking is always enabled */ +#ifndef pt_dirty_supported +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return true; +} +#endif +#endif + +#ifndef pt_entry_make_write_dirty +static inline bool pt_entry_make_write_dirty(struct pt_state *pts) +{ + return false; +} +#endif + +/* + * Format supplies either: + * pt_entry_oa - OA is at the start of a contiguous entry + * or + * pt_item_oa - OA is adjusted for every item in a contiguous entry + * + * Build the missing one + * + * The internal helper _pt_entry_oa_fast() allows generating + * an efficient pt_entry_oa_exact(), it doesn't care which + * option is selected. + */ +#ifdef pt_entry_oa +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts) +{ + return pt_entry_oa(pts) | + log2_mul(pts->index, pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_entry_oa +#endif + +#ifdef pt_item_oa +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts) +{ + return log2_set_mod(pt_item_oa(pts), 0, + pt_entry_num_contig_lg2(pts) + + pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_item_oa +#endif + +/* + * If not supplied by the format then use the constant + * PT_MAX_OUTPUT_ADDRESS_LG2. 
+ */ +#ifndef pt_max_oa_lg2 +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common) +{ + return PT_MAX_OUTPUT_ADDRESS_LG2; +} +#endif + +#ifndef pt_has_system_page_size +static inline bool pt_has_system_page_size(const struct pt_common *common) +{ + return PT_GRANULE_LG2SZ == PAGE_SHIFT; +} +#endif + +/* + * If not supplied by the format then assume only one contiguous size determined + * by pt_contig_count_lg2() + */ +#ifndef pt_possible_sizes +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts); + +/* Return a bitmap of possible leaf page sizes at this level */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) + return 0; + return log2_to_int(isz_lg2) | + log2_to_int(pt_contig_count_lg2(pts) + isz_lg2); +} +#endif + +/* If not supplied by the format then use 0. */ +#ifndef pt_full_va_prefix +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common) +{ + return 0; +} +#endif + +/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */ +#ifndef pt_clear_entries +static inline void pt_clear_entries64(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries32(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u32 *tablep = pt_cur_table(pts, u32) + pts->index; + u32 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + if (PT_ITEM_WORD_SIZE == sizeof(u32)) + pt_clear_entries32(pts, num_contig_lg2); + else + pt_clear_entries64(pts, num_contig_lg2); +} +#define pt_clear_entries pt_clear_entries +#endif + +/* + * Format can call in the pt_install_leaf_entry() to check the arguments are all + * aligned correctly. + */ +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2))) + return false; + +#ifdef pt_possible_sizes + if (PT_WARN_ON(isz_lg2 > oasz_lg2 || + oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts))) + return false; +#else + if (PT_WARN_ON(oasz_lg2 != isz_lg2 && + oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts))) + return false; +#endif + + if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2))) + return false; + return true; +} + +#endif diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h new file mode 100644 index 000000000000..87f4a26c1a41 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -0,0 +1,636 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Iterators for Generic Page Table + */ +#ifndef __GENERIC_PT_PT_ITER_H +#define __GENERIC_PT_PT_ITER_H + +#include "pt_common.h" + +#include + +/* + * Use to mangle symbols so that backtraces and the symbol table are + * understandable. Any non-inlined function should get mangled like this. 
+ */ +#define NS(fn) CONCATENATE(PTPFX, fn) + +/** + * pt_check_range() - Validate the range can be iterated + * @range: Range to validate + * + * Check that VA and last_va fall within the permitted range of VAs. If the + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension + * is correct. + */ +static inline int pt_check_range(struct pt_range *range) +{ + pt_vaddr_t prefix; + + PT_WARN_ON(!range->max_vasz_lg2); + + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? + PT_VADDR_MAX : + 0; + } else { + prefix = pt_full_va_prefix(range->common); + } + + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) + return -ERANGE; + return 0; +} + +/** + * pt_index_to_va() - Update range->va to the current pts->index + * @pts: Iteration State + * + * Adjust range->va to match the current index. This is done in a lazy manner + * since computing the VA takes several instructions and is rarely required. + */ +static inline void pt_index_to_va(struct pt_state *pts) +{ + pt_vaddr_t lower_va; + + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, + pt_table_oa_lg2sz(pts)); +} + +/* + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be + * adjusted to the end of the contiguous block if it is currently in the middle. + */ +static inline void _pt_advance(struct pt_state *pts, + unsigned int index_count_lg2) +{ + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, + index_count_lg2); +} + +/** + * pt_entry_fully_covered() - Check if the item or entry is entirely contained + * within pts->range + * @pts: Iteration State + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or + * pt_entry_oa_lg2sz() + * + * Returns: true if the item is fully enclosed by the pts->range. + */ +static inline bool pt_entry_fully_covered(const struct pt_state *pts, + unsigned int oasz_lg2) +{ + struct pt_range *range = pts->range; + + /* Range begins at the start of the entry */ + if (log2_mod(pts->range->va, oasz_lg2)) + return false; + + /* Range ends past the end of the entry */ + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) + return true; + + /* Range ends at the end of the entry */ + return log2_mod_eq_max(range->last_va, oasz_lg2); +} + +/** + * pt_range_to_index() - Starting index for an iteration + * @pts: Iteration State + * + * Return: the starting index for the iteration in pts. + */ +static inline unsigned int pt_range_to_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + PT_WARN_ON(pts->level > pts->range->top_level); + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->va, + pts->range->max_vasz_lg2), + isz_lg2); + return log2_mod(log2_div(pts->range->va, isz_lg2), + pt_num_items_lg2(pts)); +} + +/** + * pt_range_to_end_index() - Ending index iteration + * @pts: Iteration State + * + * Return: the last index for the iteration in pts. 
+ */ +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_range *range = pts->range; + unsigned int num_entries_lg2; + + if (range->va == range->last_va) + return pts->index + 1; + + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->last_va, + pts->range->max_vasz_lg2), + isz_lg2) + + 1; + + num_entries_lg2 = pt_num_items_lg2(pts); + + /* last_va falls within this table */ + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) + return log2_mod(log2_div(pts->range->last_va, isz_lg2), + num_entries_lg2) + + 1; + + return log2_to_int(num_entries_lg2); +} + +static inline void _pt_iter_first(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pts->end_index = pt_range_to_end_index(pts); + PT_WARN_ON(pts->index > pts->end_index); +} + +static inline bool _pt_iter_load(struct pt_state *pts) +{ + if (pts->index >= pts->end_index) + return false; + pt_load_entry(pts); + return true; +} + +/** + * pt_next_entry() - Advance pts to the next entry + * @pts: Iteration State + * + * Update pts to go to the next index at this level. If pts is pointing at a + * contiguous entry then the index may advance my more than one. + */ +static inline void pt_next_entry(struct pt_state *pts) +{ + if (pts->type == PT_ENTRY_OA && + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); + else + pts->index++; + pt_index_to_va(pts); +} + +/** + * for_each_pt_level_entry() - For loop wrapper over entries in the range + * @pts: Iteration State + * + * This is the basic iteration primitive. It iterates over all the entries in + * pts->range that fall within the pts's current table level. Each step does + * pt_load_entry(pts). + */ +#define for_each_pt_level_entry(pts) \ + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) + +/** + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker + * @pts: Iteration State + * + * Alternative to for_each_pt_level_entry() if the walker function uses only a + * single entry. + */ +static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pt_load_entry(pts); + return pts->type; +} + +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = { + .common = common, + .top_table = + (struct pt_table_p *)(top_of_table & + ~(uintptr_t)PT_TOP_LEVEL_MASK), + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), + }; + struct pt_state pts = { .range = &range, .level = range.top_level }; + unsigned int max_vasz_lg2; + + max_vasz_lg2 = common->max_vasz_lg2; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + pts.level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, + pt_num_items_lg2(&pts) + + pt_table_item_lg2sz(&pts)); + + /* + * The top range will default to the lower region only with sign extend. + */ + range.max_vasz_lg2 = max_vasz_lg2; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) + max_vasz_lg2--; + + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); + range.last_va = + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); + return range; +} + +/** + * pt_top_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the + * total page table. 
Otherwise it returns the entire page table. + */ +static __always_inline struct pt_range pt_top_range(struct pt_common *common) +{ + /* + * The top pointer can change without locking. We capture the value and + * it's level here and are safe to walk it so long as both values are + * captured without tearing. + */ + return _pt_top_range(common, READ_ONCE(common->top_of_table)); +} + +/** + * pt_all_range() - Return a range that spans the entire page table + * @common: Table + * + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND + * is supported range->va and range->last_va will be incorrect during the + * iteration and must not be accessed. + */ +static inline struct pt_range pt_all_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + /* + * Pretend the table is linear from 0 without a sign extension. This + * generates the correct indexes for iteration. + */ + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); + return range; +} + +/** + * pt_upper_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static inline struct pt_range pt_upper_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); + range.last_va = PT_VADDR_MAX; + return range; +} + +/** + * pt_make_range() - Return a range that spans part of the table + * @common: Table + * @va: Start address + * @last_va: Last address + * + * The caller must validate the range with pt_check_range() before using it. + */ +static __always_inline struct pt_range +pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) +{ + struct pt_range range = + _pt_top_range(common, READ_ONCE(common->top_of_table)); + + range.va = va; + range.last_va = last_va; + + return range; +} + +/* + * Span a slice of the table starting at a lower table level from an active + * walk. + */ +static __always_inline struct pt_range +pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, + pt_vaddr_t last_va) +{ + struct pt_range range = *parent; + + range.va = va; + range.last_va = last_va; + + PT_WARN_ON(last_va < va); + PT_WARN_ON(pt_check_range(&range)); + + return range; +} + +/** + * pt_init() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * @level: Table level for the state + * @table: Pointer to the table memory at level + * + * Helper to initialize the on-stack pt_state from walker arguments. + */ +static __always_inline struct pt_state +pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = { + .range = range, + .table = table, + .level = level, + }; + return pts; +} + +/** + * pt_init_top() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * + * The pt_state points to the top most level. 
+ */ +static __always_inline struct pt_state pt_init_top(struct pt_range *range) +{ + return pt_init(range, range->top_level, range->top_table); +} + +typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table); + +/** + * pt_descend() - Recursively invoke the walker for the lower level + * @pts: Iteration State + * @arg: Value to pass to the function + * @fn: Walker function to call + * + * pts must point to a table item. Invoke fn as a walker on the table + * pts points to. + */ +static __always_inline int pt_descend(struct pt_state *pts, void *arg, + pt_level_fn_t fn) +{ + int ret; + + if (PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); + return ret; +} + +/** + * pt_walk_range() - Walk over a VA range + * @range: Range pointer + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * Walk over a VA range. The caller should have done a validity check, at + * least calling pt_check_range(), when building range. The walk will + * start at the top most table. + */ +static __always_inline int pt_walk_range(struct pt_range *range, + pt_level_fn_t fn, void *arg) +{ + return fn(range, arg, range->top_level, range->top_table); +} + +/* + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower + * level + * @pts: Iteration State + * @va: Start address + * @last_va: Last address + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over a slice of the + * lower table. The caller must ensure that va/last_va are within the table + * item. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int pt_walk_descend(const struct pt_state *pts, + pt_vaddr_t va, pt_vaddr_t last_va, + pt_level_fn_t fn, void *arg) +{ + struct pt_range range = pt_make_child_range(pts->range, va, last_va); + + if (PT_WARN_ON(!pt_can_have_table(pts)) || + PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + return fn(&range, arg, pts->level - 1, pts->table_lower); +} + +/* + * pt_walk_descend_all() - Recursively invoke the walker for a table item + * @parent_pts: Iteration State + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over the entire lower + * table. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, + void *arg) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); + + return pt_walk_descend(parent_pts, + log2_set_mod(parent_pts->range->va, 0, isz_lg2), + log2_set_mod_max(parent_pts->range->va, isz_lg2), + fn, arg); +} + +/** + * pt_range_slice() - Return a range that spans indexes + * @pts: Iteration State + * @start_index: Starting index within pts + * @end_index: Ending index within pts + * + * Create a range than spans an index range of the current table level + * pt_state points at. 
+ */ +static inline struct pt_range pt_range_slice(const struct pt_state *pts, + unsigned int start_index, + unsigned int end_index) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + pt_vaddr_t last_va; + pt_vaddr_t va; + + va = fvalog2_set_mod(pts->range->va, + log2_mul(start_index, pt_table_item_lg2sz(pts)), + table_lg2sz); + last_va = fvalog2_set_mod( + pts->range->va, + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); + return pt_make_child_range(pts->range, va, last_va); +} + +/** + * pt_top_memsize_lg2() + * @common: Table + * @top_of_table: Top of table value from _pt_top_set() + * + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this + * will compute the top size assuming the table will grow. + */ +static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = _pt_top_range(common, top_of_table); + struct pt_state pts = pt_init_top(&range); + unsigned int num_items_lg2; + + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); + if (range.top_level != PT_MAX_TOP_LEVEL && + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); + + /* Round up the allocation size to the minimum alignment */ + return max(ffs_t(u64, PT_TOP_PHYS_MASK), + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); +} + +/** + * pt_compute_best_pgsize() - Determine the best page size for leaf entries + * @pgsz_bitmap: Permitted page sizes + * @va: Starting virtual address for the leaf entry + * @last_va: Last virtual address for the leaf entry, sets the max page size + * @oa: Starting output address for the leaf entry + * + * Compute the largest page size for va, last_va, and oa together and return it + * in lg2. The largest page size depends on the format's supported page sizes at + * this level, and the relative alignment of the VA and OA addresses. 0 means + * the OA cannot be stored with the provided pgsz_bitmap. + */ +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, + pt_vaddr_t va, + pt_vaddr_t last_va, + pt_oaddr_t oa) +{ + unsigned int best_pgsz_lg2; + unsigned int pgsz_lg2; + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t mask; + + if (PT_WARN_ON(va >= last_va)) + return 0; + + /* + * Given a VA/OA pair the best page size is the largest page size + * where: + * + * 1) VA and OA start at the page. Bitwise this is the count of least + * significant 0 bits. + * This also implies that last_va/oa has the same prefix as va/oa. + */ + mask = va | oa; + + /* + * 2) The page size is not larger than the last_va (length). Since page + * sizes are always power of two this can't be larger than the + * largest power of two factor of the length. 
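+ *
+ * As a worked example (illustrative values, not taken from the patch): with
+ * va = 0x200000, oa = 0x1400000 and len = 0x200000, the lowest set bit of
+ * (va | oa) is 2^21 and the largest power of two not exceeding len is also
+ * 2^21, so a 2 MiB leaf is selected provided pgsz_bitmap offers it;
+ * otherwise the next smaller permitted size wins.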
+ */ + mask |= log2_to_int(vafls(len) - 1); + + best_pgsz_lg2 = vaffs(mask); + + /* Choose the highest bit <= best_pgsz_lg2 */ + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); + + pgsz_lg2 = vafls(pgsz_bitmap); + if (!pgsz_lg2) + return 0; + + pgsz_lg2--; + + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + PT_WARN_ON( + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + return pgsz_lg2; +} + +#define _PT_MAKE_CALL_LEVEL(fn) \ + static __always_inline int fn(struct pt_range *range, void *arg, \ + unsigned int level, \ + struct pt_table_p *table) \ + { \ + static_assert(PT_MAX_TOP_LEVEL <= 5); \ + if (level == 0) \ + return CONCATENATE(fn, 0)(range, arg, 0, table); \ + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ + return CONCATENATE(fn, 1)(range, arg, 1, table); \ + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ + return CONCATENATE(fn, 2)(range, arg, 2, table); \ + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ + return CONCATENATE(fn, 3)(range, arg, 3, table); \ + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ + return CONCATENATE(fn, 4)(range, arg, 4, table); \ + return CONCATENATE(fn, 5)(range, arg, 5, table); \ + } + +static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, + unsigned int unused_level, + struct pt_table_p *table) +{ + static_assert(PT_MAX_TOP_LEVEL <= 5); + return -EPROTOTYPE; +} + +#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ + static inline int fn(struct pt_range *range, void *arg, \ + unsigned int unused_level, \ + struct pt_table_p *table) \ + { \ + return do_fn(range, arg, level, table, descend_fn); \ + } + +/** + * PT_MAKE_LEVELS() - Build an unwound walker + * @fn: Name of the walker function + * @do_fn: Function to call at each level + * + * This builds a function call tree that can be fully inlined. + * The caller must provide a function body in an __always_inline function:: + * + * static __always_inline int do(struct pt_range *range, void *arg, + * unsigned int level, struct pt_table_p *table, + * pt_level_fn_t descend_fn) + * + * An inline function will be created for each table level that calls do_fn with + * a compile time constant for level and a pointer to the next lower function. + * This generates an optimally inlined walk where each of the functions sees a + * constant level and can codegen the exact constants/etc for that level. + * + * Note this can produce a lot of code! 
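+ *
+ * As an illustrative sketch (hypothetical names; it mirrors the structure of
+ * the iova_to_phys walker added later in this series)::
+ *
+ *	static __always_inline int __do_lookup(struct pt_range *range, void *arg,
+ *					       unsigned int level,
+ *					       struct pt_table_p *table,
+ *					       pt_level_fn_t descend_fn)
+ *	{
+ *		struct pt_state pts = pt_init(range, level, table);
+ *
+ *		switch (pt_load_single_entry(&pts)) {
+ *		case PT_ENTRY_TABLE:
+ *			return pt_descend(&pts, arg, descend_fn);
+ *		case PT_ENTRY_OA:
+ *			*(pt_oaddr_t *)arg = pt_entry_oa_exact(&pts);
+ *			return 0;
+ *		default:
+ *			return -ENOENT;
+ *		}
+ *	}
+ *	PT_MAKE_LEVELS(__lookup, __do_lookup);
+ *
+ * The generated __lookup() is then driven from the top via pt_walk_range().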
+ */ +#define PT_MAKE_LEVELS(fn, do_fn) \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ + do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ + _PT_MAKE_CALL_LEVEL(fn) + +#endif diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h new file mode 100644 index 000000000000..6dbbed119238 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_log2.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Helper macros for working with log2 values + * + */ +#ifndef __GENERIC_PT_LOG2_H +#define __GENERIC_PT_LOG2_H +#include +#include + +/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1); + +/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) + +/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1); + +/* + * Compute: + * a / c == b / c + * aka the high bits are equal + */ +#define log2_div_eq_t(type, a, b, c_lg2) \ + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) +static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); + +/* Compute a % b */ +#define log2_mod_t(type, a, b_lg2) \ + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) +static_assert(log2_mod_t(unsigned int, 1, 2) == 1); + +/* + * Compute: + * a % b == b - 1 + * aka the low bits are all 1s + */ +#define log2_mod_eq_max_t(type, a, b_lg2) \ + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) +static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); + +/* + * Return a value such that: + * a / b == ret / b + * ret % b == val + * aka set the low bits to val. val must be < b + */ +#define log2_set_mod_t(type, a, val, b_lg2) \ + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) +static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); + +/* Return a value such that: + * a / b == ret / b + * ret % b == b - 1 + * aka set the low bits to all 1s + */ +#define log2_set_mod_max_t(type, a, b_lg2) \ + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) +static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); + +/* Compute a * b */ +#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) +static_assert(log2_mul_t(unsigned int, 2, 2) == 8); + +#define _dispatch_sz(type, fn, a) \ + (sizeof(type) == 4 ? 
fn##32((u32)a) : fn##64(a)) + +/* + * Return the highest value such that: + * fls_t(u32, 0) == 0 + * fls_t(u3, 1) == 1 + * a >= log2_to_int(ret - 1) + * aka find last set bit + */ +static inline unsigned int fls32(u32 a) +{ + return fls(a); +} +#define fls_t(type, a) _dispatch_sz(type, fls, a) + +/* + * Return the highest value such that: + * ffs_t(u32, 0) == UNDEFINED + * ffs_t(u32, 1) == 0 + * log_mod(a, ret) == 0 + * aka find first set bit + */ +static inline unsigned int __ffs32(u32 a) +{ + return __ffs(a); +} +#define ffs_t(type, a) _dispatch_sz(type, __ffs, a) + +/* + * Return the highest value such that: + * ffz_t(u32, U32_MAX) == UNDEFINED + * ffz_t(u32, 0) == 0 + * ffz_t(u32, 1) == 1 + * log_mod(a, ret) == log_to_max_int(ret) + * aka find first zero bit + */ +static inline unsigned int ffz32(u32 a) +{ + return ffz(a); +} +static inline unsigned int ffz64(u64 a) +{ + if (sizeof(u64) == sizeof(unsigned long)) + return ffz(a); + + if ((u32)a == U32_MAX) + return ffz32(a >> 32) + 32; + return ffz32(a); +} +#define ffz_t(type, a) _dispatch_sz(type, ffz, a) + +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h new file mode 100644 index 000000000000..e69a75511313 --- /dev/null +++ b/include/linux/generic_pt/common.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_COMMON_H +#define __GENERIC_PT_COMMON_H + +#include +#include +#include + +/** + * DOC: Generic Radix Page Table + * + * Generic Radix Page Table is a set of functions and helpers to efficiently + * parse radix style page tables typically seen in HW implementations. The + * interface is built to deliver similar code generation as the mm's pte/pmd/etc + * system by fully inlining the exact code required to handle each table level. + * + * Like the mm subsystem each format contributes its parsing implementation + * under common names and the common code implements the required algorithms. + * + * The system is divided into three logical levels: + * + * - The page table format and its manipulation functions + * - Generic helpers to give a consistent API regardless of underlying format + * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM) + * + * Multiple implementations are supported. The intention is to have the generic + * format code be re-usable for whatever specialized implementation is required. + * The generic code is solely about the format of the radix tree; it does not + * include memory allocation or higher level decisions that are left for the + * implementation. + * + * The generic framework supports a superset of functions across many HW + * implementations: + * + * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes + * - Multi-level tables, up to 6 levels. Runtime selected top level + * - Runtime variable table level size (ARM's concatenated tables) + * - Expandable top level allowing dynamic sizing of table levels + * - Optional leaf entries at any level + * - 32-bit/64-bit virtual and output addresses, using every address bit + * - Dirty tracking + * - Sign extended addressing + */ + +/** + * struct pt_common - struct for all page table implementations + */ +struct pt_common { + /** + * @top_of_table: Encodes the table top pointer and the top level in a + * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower + * bits of the aligned table pointer are used for the level. 
+ */ + uintptr_t top_of_table; + /** + * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits + * must be zero. This may be less than what the page table format + * supports, but must not be more. + */ + u8 max_oasz_lg2; + /** + * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits + * are 0 or 1 depending on pt_full_va_prefix(). This may be less than + * what the page table format supports, but must not be more. When + * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability. + */ + u8 max_vasz_lg2; + /** + * @features: Bitmap of `enum pt_features` + */ + unsigned int features; +}; + +/* Encoding parameters for top_of_table */ +enum { + PT_TOP_LEVEL_BITS = 3, + PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0), +}; + +/** + * enum pt_features - Features turned on in the table. Each symbol is a bit + * position. + */ +enum pt_features { + /** + * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to + * PT_VADDR_MAX. + */ + PT_FEAT_FULL_VA, + /** + * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased + * dynamically during map. This requires HW support for atomically + * setting both the table top pointer and the starting table level. + */ + PT_FEAT_DYNAMIC_TOP, + /** + * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign + * extends up to the full pt_vaddr_t. This divides the page table into + * three VA ranges:: + * + * 0 -> 2^N - 1 Lower + * 2^N -> (MAX - 2^N - 1) Non-Canonical + * MAX - 2^N -> MAX Upper + * + * In this mode pt_common::max_vasz_lg2 includes the sign bit and the + * upper bits that don't fall within the translation are just validated. + * + * If not set there is no sign extension and valid VA goes from 0 to 2^N + * - 1. + */ + PT_FEAT_SIGN_EXTEND, + /** + * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA + * ranges which will clean out any walk cache or any IOPTE fully + * contained by the range. The optimization objective is to minimize the + * number of flushes even if ranges include IOVA gaps that do not need + * to be flushed. + */ + PT_FEAT_FLUSH_RANGE, + /** + * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that + * the optimization objective is to only flush IOVA that has been + * changed. This mode is suitable for cases like hypervisor shadowing + * where flushing unchanged ranges may cause the hypervisor to reparse + * significant amount of page table. + */ + PT_FEAT_FLUSH_RANGE_NO_GAPS, + /* private: */ + PT_FEAT_FMT_START, +}; + +#endif -- cgit v1.2.3 From cdb39d9185795b744dab4d4d782f2fe3f5eca10c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:01 -0400 Subject: iommupt: Add the basic structure of the iommu implementation The existing IOMMU page table implementations duplicate all of the working algorithms for each format. By using the generic page table API a single C version of the IOMMU algorithms can be created and re-used for all of the different formats used in the drivers. The implementation will provide a single C version of the iommu domain operations: iova_to_phys, map, unmap, and read_and_clear_dirty. Further, adding new algorithms and techniques becomes easy to do across the entire fleet of drivers and formats. The C functions are drop in compatible with the existing iommu_domain_ops using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation compilation unit will produce exported symbols following the pattern pt_iommu_FMT_map_pages() which the macro directly maps to the iommu_domain_ops members. 
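For illustration only (not part of this patch; IOMMU_PT_DOMAIN_OPS() and the
iova_to_phys op are introduced by later patches in this series, and the
driver_* names are hypothetical), a driver using the amdv1 format could wire
the generated symbols in like this:

  static const struct iommu_domain_ops driver_domain_ops = {
          /* expands to .iova_to_phys = &pt_iommu_amdv1_iova_to_phys, */
          IOMMU_PT_DOMAIN_OPS(amdv1)
          .attach_dev = driver_attach_dev,
          .free = driver_domain_free,
  };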
This avoids the additional function pointer indirection like io-pgtable has. The top level struct used by the drivers is pt_iommu_table_FMT. It contains the other structs to allow container_of() to move between the driver, iommu page table, generic page table, and generic format layers. struct pt_iommu_table_amdv1 { struct pt_iommu { struct iommu_domain domain; } iommu; struct pt_amdv1 { struct pt_common common; } amdpt; }; The driver is expected to union the pt_iommu_table_FMT with its own existing domain struct: struct driver_domain { union { struct iommu_domain domain; struct pt_iommu_table_amdv1 amdv1; }; }; PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1, domain); To create an alias to avoid renaming 'domain' in a lot of driver code. This allows all the layers to access all the necessary functions to implement their different roles with no change to any of the existing iommu core code. Implement the basic starting point: pt_iommu_init(), get_info() and deinit(). Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/Kconfig | 13 ++ drivers/iommu/generic_pt/fmt/iommu_template.h | 39 ++++ drivers/iommu/generic_pt/iommu_pt.h | 259 ++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 150 +++++++++++++++ 4 files changed, 461 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h create mode 100644 drivers/iommu/generic_pt/iommu_pt.h create mode 100644 include/linux/generic_pt/iommu.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index fb0f431ddba0..a81dfdd72ca0 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -17,4 +17,17 @@ config DEBUG_GENERIC_PT kernels. The kunit tests require this to be enabled to get full coverage. + +config IOMMU_PT + tristate "IOMMU Page Tables" + select IOMMU_API + depends on IOMMU_SUPPORT + depends on GENERIC_PT + help + Generic library for building IOMMU page tables + + IOMMU_PT provides an implementation of the page table operations + related to struct iommu_domain using GENERIC_PT. It provides a single + implementation of the page table operations that can be shared by + multiple drivers. endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h new file mode 100644 index 000000000000..5b631bc07cbc --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Template to build the iommu module and kunit from the format and + * implementation headers. + * + * The format should have: + * #define PT_FMT + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy)) + * And optionally: + * #define PT_FORCE_ENABLED_FEATURES .. 
+ * #define PT_FMT_VARIANT + */ +#include +#include + +#ifdef PT_FMT_VARIANT +#define PTPFX_RAW \ + CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT) +#else +#define PTPFX_RAW PT_FMT +#endif + +#define PTPFX CONCATENATE(PTPFX_RAW, _) + +#define _PT_FMT_H PT_FMT.h +#define PT_FMT_H __stringify(_PT_FMT_H) + +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H) +#define PT_DEFS_H __stringify(_PT_DEFS_H) + +#include +#include PT_DEFS_H +#include "../pt_defs.h" +#include PT_FMT_H +#include "../pt_common.h" + +#include "../iommu_pt.h" diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h new file mode 100644 index 000000000000..564f2d3a6e11 --- /dev/null +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * "Templated C code" for implementing the iommu operations for page tables. + * This is compiled multiple times, over all the page table formats to pick up + * the per-format definitions. + */ +#ifndef __GENERIC_PT_IOMMU_PT_H +#define __GENERIC_PT_IOMMU_PT_H + +#include "pt_iter.h" + +#include +#include +#include "../iommu-pages.h" + +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) + +struct pt_iommu_collect_args { + struct iommu_pages_list free_list; +}; + +static int __collect_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_collect_args *collect = arg; + int ret; + + if (!pt_can_have_table(&pts)) + return 0; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_pages_list_add(&collect->free_list, pts.table_lower); + ret = pt_descend(&pts, arg, __collect_tables); + if (ret) + return ret; + continue; + } + } + return 0; +} + +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp) +{ + struct pt_iommu *iommu_table = iommu_from_common(common); + + /* + * Top doesn't need the free list or otherwise, so it technically + * doesn't need to use iommu pages. Use the API anyhow as the top is + * usually not smaller than PAGE_SIZE to keep things simple. 
+ */ + return iommu_alloc_pages_node_sz( + iommu_table->nid, gfp, + log2_to_int(pt_top_memsize_lg2(common, top_of_table))); +} + +static void NS(get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_top_range(common); + struct pt_state pts = pt_init_top(&range); + pt_vaddr_t pgsize_bitmap = 0; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; + pts.level++) { + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) + break; + pgsize_bitmap |= pt_possible_sizes(&pts); + } + } else { + for (pts.level = 0; pts.level <= range.top_level; pts.level++) + pgsize_bitmap |= pt_possible_sizes(&pts); + } + + /* Hide page sizes larger than the maximum OA */ + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); +} + +static void NS(deinit)(struct pt_iommu *iommu_table) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + /* + * The driver has to already have fenced the HW access to the page table + * and invalidated any caching referring to this memory. + */ + iommu_put_pages_list(&collect.free_list); +} + +static const struct pt_iommu_ops NS(ops) = { + .get_info = NS(get_info), + .deinit = NS(deinit), +}; + +static int pt_init_common(struct pt_common *common) +{ + struct pt_range top_range = pt_top_range(common); + + if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL)) + return -EINVAL; + + if (top_range.top_level == PT_MAX_TOP_LEVEL || + common->max_vasz_lg2 == top_range.max_vasz_lg2) + common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP); + + if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2) + common->features |= BIT(PT_FEAT_FULL_VA); + + /* Requested features must match features compiled into this format */ + if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) || + (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) && + (common->features & PT_FORCE_ENABLED_FEATURES) != + PT_FORCE_ENABLED_FEATURES)) + return -EOPNOTSUPP; + + if (common->max_oasz_lg2 == 0) + common->max_oasz_lg2 = pt_max_oa_lg2(common); + else + common->max_oasz_lg2 = min(common->max_oasz_lg2, + pt_max_oa_lg2(common)); + return 0; +} + +static int pt_iommu_init_domain(struct pt_iommu *iommu_table, + struct iommu_domain *domain) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_iommu_info info; + struct pt_range range; + + NS(get_info)(iommu_table, &info); + + domain->type = __IOMMU_DOMAIN_PAGING; + domain->pgsize_bitmap = info.pgsize_bitmap; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + range = _pt_top_range(common, + _pt_top_set(NULL, PT_MAX_TOP_LEVEL)); + else + range = pt_top_range(common); + + /* A 64-bit high address space table on a 32-bit system cannot work. */ + domain->geometry.aperture_start = (unsigned long)range.va; + if ((pt_vaddr_t)domain->geometry.aperture_start != range.va) + return -EOVERFLOW; + + /* + * The aperture is limited to what the API can do after considering all + * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used + * to store a VA. Set the aperture to something that is valid for all + * cases. Saturate instead of truncate the end if the types are smaller + * than the top range. 
aperture_end should be called aperture_last. + */ + domain->geometry.aperture_end = (unsigned long)range.last_va; + if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) { + domain->geometry.aperture_end = ULONG_MAX; + domain->pgsize_bitmap &= ULONG_MAX; + } + domain->geometry.force_aperture = true; + + return 0; +} + +static void pt_iommu_zero(struct pt_iommu_table *fmt_table) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_iommu cfg = *iommu_table; + + static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0); + memset_after(fmt_table, 0, iommu.domain); + + /* The caller can initialize some of these values */ + iommu_table->nid = cfg.nid; +} + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) + +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_table_p *table_mem; + int ret; + + if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 || + !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2) + return -EINVAL; + + pt_iommu_zero(fmt_table); + common->features = cfg->common.features; + common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2; + common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2; + ret = pt_iommu_fmt_init(fmt_table, cfg); + if (ret) + return ret; + + if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common)) + return -EINVAL; + + ret = pt_init_common(common); + if (ret) + return ret; + + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && + (pt_feature(common, PT_FEAT_FULL_VA) || + pt_feature(common, PT_FEAT_DYNAMIC_TOP))) + return -EINVAL; + + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); + if (ret) + return ret; + + table_mem = table_alloc_top(common, common->top_of_table, gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_top_set(common, table_mem, pt_top_get_level(common)); + + /* Must be last, see pt_iommu_deinit() */ + iommu_table->ops = &NS(ops); + return 0; +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU"); + +#ifdef pt_iommu_fmt_hw_info +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info) +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info) +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table, + struct pt_iommu_table_hw_info *info) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range top_range = pt_top_range(common); + + pt_iommu_fmt_hw_info(fmt_table, &top_range, info); +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); +MODULE_IMPORT_NS("GENERIC_PT"); + +#endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h new file mode 100644 index 000000000000..defa96abc497 --- /dev/null +++ b/include/linux/generic_pt/iommu.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_IOMMU_H +#define __GENERIC_PT_IOMMU_H + +#include +#include +#include + +struct pt_iommu_ops; + +/** + * DOC: IOMMU Radix Page Table + * + * The IOMMU implementation of the Generic Page Table provides an ops struct + * that is useful to go with an 
iommu_domain to serve the DMA API, IOMMUFD and + * the generic map/unmap interface. + * + * This interface uses a caller provided locking approach. The caller must have + * a VA range lock concept that prevents concurrent threads from calling ops on + * the same VA. Generally the range lock must be at least as large as a single + * map call. + */ + +/** + * struct pt_iommu - Base structure for IOMMU page tables + * + * The format-specific struct will include this as the first member. + */ +struct pt_iommu { + /** + * @domain: The core IOMMU domain. The driver should use a union to + * overlay this memory with its previously existing domain struct to + * create an alias. + */ + struct iommu_domain domain; + + /** + * @ops: Function pointers to access the API + */ + const struct pt_iommu_ops *ops; + + /** + * @nid: Node ID to use for table memory allocations. The IOMMU driver + * may want to set the NID to the device's NID, if there are multiple + * table walkers. + */ + int nid; +}; + +/** + * struct pt_iommu_info - Details about the IOMMU page table + * + * Returned from pt_iommu_ops->get_info() + */ +struct pt_iommu_info { + /** + * @pgsize_bitmap: A bitmask where each set bit indicates + * a page size that can be natively stored in the page table. + */ + u64 pgsize_bitmap; +}; + +struct pt_iommu_ops { + /** + * @get_info: Return the pt_iommu_info structure + * @iommu_table: Table to query + * + * Return some basic static information about the page table. + */ + void (*get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info); + + /** + * @deinit: Undo a format specific init operation + * @iommu_table: Table to destroy + * + * Release all of the memory. The caller must have already removed the + * table from all HW access and all caches. + */ + void (*deinit)(struct pt_iommu *iommu_table); +}; + +static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) +{ + /* + * It is safe to call pt_iommu_deinit() before an init, or if init + * fails. The ops pointer will only become non-NULL if deinit needs to be + * run. + */ + if (iommu_table->ops) + iommu_table->ops->deinit(iommu_table); +} + +/** + * struct pt_iommu_cfg - Common configuration values for all formats + */ +struct pt_iommu_cfg { + /** + * @features: Features required. Only these features will be turned on. + * The feature list should reflect what the IOMMU HW is capable of. + */ + unsigned int features; + /** + * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will + * imply the top level of the table. + */ + u8 hw_max_vasz_lg2; + /** + * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format + * might select a lower maximum OA. + */ + u8 hw_max_oasz_lg2; +}; + +/* Generate the exported function signatures from iommu_pt.h */ +#define IOMMU_PROTOTYPES(fmt) \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ + struct pt_iommu_##fmt##_hw_info *info) +#define IOMMU_FORMAT(fmt, member) \ + struct pt_iommu_##fmt { \ + struct pt_iommu iommu; \ + struct pt_##fmt member; \ + }; \ + IOMMU_PROTOTYPES(fmt) + +/* + * The driver should setup its domain struct like + * union { + * struct iommu_domain domain; + * struct pt_iommu_xxx xx; + * }; + * PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain); + * + * Which creates an alias between driver_domain.domain and + * driver_domain.xx.iommu.domain. 
This is to avoid a mass rename of existing + * driver_domain.domain users. + */ +#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \ + static_assert(offsetof(s, pt_iommu_memb.domain) == \ + offsetof(s, domain_memb)) + +#undef IOMMU_PROTOTYPES +#undef IOMMU_FORMAT +#endif -- cgit v1.2.3 From 879ced2bab1ba95e98fac56c9503791183bc7cbb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:02 -0400 Subject: iommupt: Add the AMD IOMMU v1 page table format AMD IOMMU v1 is unique in supporting contiguous pages with a variable size and it can decode the full 64 bit VA space. Unlike other x86 page tables this explicitly does not do sign extension as part of allowing the entire 64 bit VA space to be supported. The general design is quite similar to the x86 PAE format, except with a 6th level and quite different PTE encoding. This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in the existing code as the existing AMDv1 code starts out with a 3 level table and adds levels on the fly if more IOVA is needed. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 65,64 , 62,61 , -1.01 2^13, 70,66 , 67,62 , -8.08 2^14, 73,69 , 71,65 , -9.09 2^15, 78,75 , 75,71 , -5.05 2^16, 89,89 , 86,84 , -2.02 2^17, 128,121 , 124,112 , -10.10 2^18, 175,175 , 170,163 , -4.04 2^19, 264,306 , 261,279 , 6.06 2^20, 444,525 , 438,489 , 10.10 2^21, 60,62 , 58,59 , 1.01 256*2^12, 381,1833 , 367,1795 , 79.79 256*2^21, 375,1623 , 356,1555 , 77.77 256*2^30, 356,1338 , 349,1277 , 72.72 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 76,89 , 71,86 , 17.17 2^13, 79,89 , 75,86 , 12.12 2^14, 78,90 , 74,86 , 13.13 2^15, 82,89 , 74,86 , 13.13 2^16, 79,89 , 74,86 , 13.13 2^17, 81,89 , 77,87 , 11.11 2^18, 90,92 , 87,89 , 2.02 2^19, 91,93 , 88,90 , 2.02 2^20, 96,95 , 91,92 , 1.01 2^21, 72,88 , 68,85 , 20.20 256*2^12, 372,6583 , 364,6251 , 94.94 256*2^21, 398,6032 , 392,5758 , 93.93 256*2^30, 396,5665 , 389,5258 , 92.92 The ~5-17x speedup when working with mutli-PTE map/unmaps is because the AMD implementation rewalks the entire table on every new PTE while this version retains its position. The same speedup will be seen with dirtys as well. The old implementation triggers a compiler optimization that ends up generating a "rep stos" memset for contiguous PTEs. Since AMD can have contiguous PTEs that span 2Kbytes of table this is a huge win compared to a normal movq loop. It is why the unmap side has a fairly flat runtime as the contiguous PTE sides increases. This version makes it explicit with a memset64() call. 
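As a usage sketch (illustrative only: the driver_domain member and variable
names are hypothetical, the feature bits simply mirror this format's
force-enabled features and the sizes its architectural maximums):

  struct pt_iommu_amdv1_cfg cfg = {
          .common = {
                  .features = BIT(PT_FEAT_DYNAMIC_TOP) |
                              BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
                              BIT(PT_FEAT_AMDV1_FORCE_COHERENCE),
                  .hw_max_vasz_lg2 = 64,  /* full 64-bit IOVA decode */
                  .hw_max_oasz_lg2 = 52,  /* 52-bit output addresses */
          },
          .starting_level = 2,            /* 3-level table, "mode 3" */
  };
  int ret;

  /* nid set before init is preserved by pt_iommu_init() */
  driver_domain->amdv1.iommu.nid = dev_to_node(iommu_dev);
  ret = pt_iommu_amdv1_init(&driver_domain->amdv1, &cfg, GFP_KERNEL);
  if (ret)
          return ret;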
Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/Makefile | 1 + drivers/iommu/generic_pt/Kconfig | 12 + drivers/iommu/generic_pt/fmt/Makefile | 11 + drivers/iommu/generic_pt/fmt/amdv1.h | 387 +++++++++++++++++++++++++++++ drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 ++ drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 ++ include/linux/generic_pt/common.h | 19 ++ include/linux/generic_pt/iommu.h | 12 + 8 files changed, 478 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/Makefile create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c (limited to 'include/linux') diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 355294fa9033..b17ef9818759 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -3,6 +3,7 @@ obj-y += arm/ iommufd/ obj-$(CONFIG_AMD_IOMMU) += amd/ obj-$(CONFIG_INTEL_IOMMU) += intel/ obj-$(CONFIG_RISCV_IOMMU) += riscv/ +obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index a81dfdd72ca0..cbdad222923b 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -30,4 +30,16 @@ config IOMMU_PT related to struct iommu_domain using GENERIC_PT. It provides a single implementation of the page table operations that can be shared by multiple drivers. + +if IOMMU_PT +config IOMMU_PT_AMDV1 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the AMD v1 page table. AMDv1 is the + "host" page table. It supports granular page sizes of almost every + power of 2 and decodes an full 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. +endif endif diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile new file mode 100644 index 000000000000..a4d83b7e0cf6 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 + +define create_format +obj-$(2) += iommu_$(1).o + +endef + +$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 000000000000..7423ed71417d --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * AMD IOMMU v1 page table + * + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" + * + * Note the level numbering here matches the core code, so level 0 is the same + * as mode 1. 
+ * + */ +#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H + +#include "defs_amdv1.h" +#include "../pt_defs.h" + +#include +#include +#include +#include +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 5, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* The DTE only has these bits for the top phyiscal address */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* PTE bits */ +enum { + AMDV1PT_FMT_PR = BIT(0), + AMDV1PT_FMT_D = BIT(6), + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), + AMDV1PT_FMT_FC = BIT_ULL(60), + AMDV1PT_FMT_IR = BIT_ULL(61), + AMDV1PT_FMT_IW = BIT_ULL(62), +}; + +/* + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make + * these defines to avoid it. + */ +#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7 + +static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa amdv1pt_table_pa + +/* Returns the oa for the start of the contiguous entry */ +static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + pt_oaddr_t oa; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + oa = FIELD_GET(AMDV1PT_FMT_OA, entry); + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) { + unsigned int sz_bits = oaffz(oa); + + oa = oalog2_set_mod(oa, 0, sz_bits); + } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) != + AMDV1PT_FMT_NL_DEFAULT)) + return 0; + return oalog2_mul(oa, PT_GRANULE_LG2SZ); +} +#define pt_entry_oa amdv1pt_entry_oa + +static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts) +{ + /* + * Table 15: Page Table Level Parameters + * The top most level cannot have translation entries + */ + return pts->level < PT_MAX_TOP_LEVEL; +} +#define pt_can_have_leaf amdv1pt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +amdv1pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + u32 code; + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) == + AMDV1PT_FMT_NL_DEFAULT) + return ilog2(1); + + PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) != + AMDV1PT_FMT_NL_SIZE); + + /* + * The contiguous size is encoded in the length of a string of 1's in + * the low bits of the OA. Reverse the equation: + * code = log2_to_int(num_contig_lg2 + item_lg2sz - + * PT_GRANULE_LG2SZ - 1) - 1 + * Which can be expressed as: + * num_contig_lg2 = oalog2_ffz(code) + 1 - + * item_lg2sz - PT_GRANULE_LG2SZ + * + * Assume the bit layout is correct and remove the masking. Reorganize + * the equation to move all the arithmetic before the ffz. + */ + code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 + + pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ); + return ffz_t(u32, code); +} +#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2 + +static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts) +{ + /* + * Top entry covers bits [63:57] only, this is handled through + * max_vasz_lg2. 
+ */ + if (PT_WARN_ON(pts->level == 5)) + return 7; + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 amdv1pt_num_items_lg2 + +static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!amdv1pt_can_have_leaf(pts)) + return 0; + + /* + * Table 14: Example Page Size Encodings + * Address bits 51:32 can be used to encode page sizes greater than 4 + * Gbytes. Address bits 63:52 are zero-extended. + * + * 512GB Pages are not supported due to a hardware bug. + * Otherwise every power of two size is supported. + */ + return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1), + isz_lg2) & ~SZ_512G; +} +#define pt_possible_sizes amdv1pt_possible_sizes + +static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64) + pts->index; + unsigned int next_level; + u64 entry; + + pts->entry = entry = READ_ONCE(*tablep); + if (!(entry & AMDV1PT_FMT_PR)) + return PT_ENTRY_EMPTY; + + next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry); + if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT || + next_level == AMDV1PT_FMT_NL_SIZE) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw amdv1pt_load_entry_raw + +static inline void +amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (oasz_lg2 == isz_lg2) { + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_DEFAULT); + WRITE_ONCE(*tablep, entry); + } else { + unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_SIZE) | + FIELD_PREP(AMDV1PT_FMT_OA, + oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ - + 1) - + 1); + + /* See amdv1pt_clear_entries() */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + memset64(tablep, entry, log2_to_int(num_contig_lg2)); + } + } + pts->entry = entry; +} +#define pt_install_leaf_entry amdv1pt_install_leaf_entry + +static inline bool amdv1pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + /* + * IR and IW are ANDed from the table levels along with the PTE. We + * always control permissions from the PTE, so always set IR and IW for + * tables. 
+ */ + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) | + FIELD_PREP(AMDV1PT_FMT_OA, + log2_div(table_pa, PT_GRANULE_LG2SZ)) | + AMDV1PT_FMT_IR | AMDV1PT_FMT_IW; + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table amdv1pt_install_table + +static inline void amdv1pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW); +} +#define pt_attr_from_entry amdv1pt_attr_from_entry + +static inline void amdv1pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + /* + * gcc generates rep stos for the io-pgtable code, and this difference + * can show in microbenchmarks with larger contiguous page sizes. + * rep is slower for small cases. + */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); + } else { + memset64(tablep, 0, log2_to_int(num_contig_lg2)); + } +} +#define pt_clear_entries amdv1pt_clear_entries + +static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + if (READ_ONCE(*tablep) & AMDV1PT_FMT_D) + return true; + return false; +} +#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty + +static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D); +} +#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean + +static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | AMDV1PT_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_amdv1 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_amdv1, iommu) + ->amdpt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu; +} + +static inline int amdv1pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE)) + pte |= AMDV1PT_FMT_FC; + if (iommu_prot & IOMMU_READ) + pte |= AMDV1PT_FMT_IR; + if (iommu_prot & IOMMU_WRITE) + pte |= AMDV1PT_FMT_IW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot amdv1pt_iommu_set_prot + +static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, + const struct pt_iommu_amdv1_cfg *cfg) +{ + struct pt_amdv1 *table = &iommu_table->amdpt; + unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + + if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL) + return -EINVAL; + + if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) && + cfg->starting_level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * + (cfg->starting_level + 1); + + table->common.max_vasz_lg2 = + min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2); + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + pt_top_set_level(&table->common, cfg->starting_level); + return 0; +} +#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init + +static inline void +amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, + const struct pt_range *top_range, + struct pt_iommu_amdv1_hw_info *info) +{ + info->host_pt_root = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK); + info->mode = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h new file mode 100644 index 000000000000..0b9614ca6d10 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H +#define __GENERIC_PT_FMT_DEFS_AMDV1_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct amdv1pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs amdv1pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c new file mode 100644 index 000000000000..72a2337d0c55 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT amdv1 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \ + BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) +#define PT_FORCE_ENABLED_FEATURES \ + (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) + +#include "iommu_template.h" diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index e69a75511313..21e33489cbf2 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -132,4 +132,23 @@ enum pt_features { PT_FEAT_FMT_START, }; +struct pt_amdv1 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START, + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. 
+ */ + PT_FEAT_AMDV1_FORCE_COHERENCE, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index defa96abc497..dc731fe003d1 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -145,6 +145,18 @@ struct pt_iommu_cfg { static_assert(offsetof(s, pt_iommu_memb.domain) == \ offsetof(s, domain_memb)) +struct pt_iommu_amdv1_cfg { + struct pt_iommu_cfg common; + unsigned int starting_level; +}; + +struct pt_iommu_amdv1_hw_info { + u64 host_pt_root; + u8 mode; +}; + +IOMMU_FORMAT(amdv1, amdpt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 9d4c274cd7d5e1b6b9e116e155f16bcd208237d8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:03 -0400 Subject: iommupt: Add iova_to_phys op iova_to_phys is a performance path for the DMA API and iommufd, implement it using an unrolled get_user_pages() like function waterfall scheme. The implementation itself is fairly trivial. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 105 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 19 +++++-- 2 files changed, 119 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 564f2d3a6e11..5ff1b887928a 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -17,6 +17,111 @@ #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) +static int make_range_ul(struct pt_common *common, struct pt_range *range, + unsigned long iova, unsigned long len) +{ + unsigned long last; + + if (unlikely(len == 0)) + return -EINVAL; + + if (check_add_overflow(iova, len - 1, &last)) + return -EOVERFLOW; + + *range = pt_make_range(common, iova, last); + if (sizeof(iova) > sizeof(range->va)) { + if (unlikely(range->va != iova || range->last_va != last)) + return -EOVERFLOW; + } + return 0; +} + +static __maybe_unused int make_range_u64(struct pt_common *common, + struct pt_range *range, u64 iova, + u64 len) +{ + if (unlikely(iova > ULONG_MAX || len > ULONG_MAX)) + return -EOVERFLOW; + return make_range_ul(common, range, iova, len); +} + +/* + * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch + * to the correct validation based on the type. 
+ */ +#define make_range_no_check(common, range, iova, len) \ + ({ \ + int ret; \ + if (sizeof(iova) > sizeof(unsigned long) || \ + sizeof(len) > sizeof(unsigned long)) \ + ret = make_range_u64(common, range, iova, len); \ + else \ + ret = make_range_ul(common, range, iova, len); \ + ret; \ + }) + +#define make_range(common, range, iova, len) \ + ({ \ + int ret = make_range_no_check(common, range, iova, len); \ + if (!ret) \ + ret = pt_check_range(range); \ + ret; \ + }) + +static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + pt_oaddr_t *res = arg; + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, descend_fn); + case PT_ENTRY_OA: + *res = pt_entry_oa_exact(&pts); + return 0; + } + return -ENOENT; +} +PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys); + +/** + * iova_to_phys() - Return the output address for the given IOVA + * @iommu_table: Table to query + * @iova: IO virtual address to query + * + * Determine the output address from the given IOVA. @iova may have any + * alignment, the returned physical will be adjusted with any sub page offset. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Return: 0 if there is no translation for the given iova. + */ +phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_range range; + pt_oaddr_t res; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + ret = pt_walk_range(&range, __iova_to_phys, &res); + /* PHYS_ADDR_MAX would be a better error code */ + if (ret) + return 0; + return res; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); + struct pt_iommu_collect_args { struct iommu_pages_list free_list; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index dc731fe003d1..5622856e1998 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -116,11 +116,13 @@ struct pt_iommu_cfg { }; /* Generate the exported function signatures from iommu_pt.h */ -#define IOMMU_PROTOTYPES(fmt) \ - int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ - const struct pt_iommu_##fmt##_cfg *cfg, \ - gfp_t gfp); \ - void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ +#define IOMMU_PROTOTYPES(fmt) \ + phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ + dma_addr_t iova); \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ struct pt_iommu_##fmt##_hw_info *info) #define IOMMU_FORMAT(fmt, member) \ struct pt_iommu_##fmt { \ @@ -129,6 +131,13 @@ struct pt_iommu_cfg { }; \ IOMMU_PROTOTYPES(fmt) +/* + * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the + * iommu_pt + */ +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, + /* * The driver should setup its domain struct like * union { -- cgit v1.2.3 From 7c53f4238aa8bfb476e177263133ead2eeb8d55d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:04 -0400 Subject: iommupt: Add unmap_pages op unmap_pages removes mappings and any fully 
contained interior tables from the given range. This follows the now-standard iommu_domain API definition where it does not split up larger page sizes into smaller. The caller must perform unmap only on ranges created by map, or it must have somehow otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys to scan for them). Future work will provide 'cut', which explicitly does the page size split if the HW can support it. unmap is implemented with a recursive descent of the tree. If the caller provides a VA range that spans an entire table item then the table memory can be freed as well. If an entire table item can be freed then this version will also check the leaf-only level of the tree to ensure that all entries are present, and generate -EINVAL otherwise. Many of the existing drivers don't do this extra check. This version sits under the iommu_domain_ops as unmap_pages() but does not require the external page size calculation. The implementation is actually unmap_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangement of page sizes. A future series can optimize __iommu_unmap() to take advantage of this. Freed page table memory is batched up in the gather and will be freed in the driver's iotlb_sync() callback after the IOTLB flush completes. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 10 ++- 2 files changed, 164 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 5ff1b887928a..e3d1b272723d 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -14,6 +14,29 @@ #include #include #include "../iommu-pages.h" +#include +#include + +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, + struct pt_iommu *iommu_table, pt_vaddr_t iova, + pt_vaddr_t len, + struct iommu_pages_list *free_list) +{ + struct pt_common *common = common_from_iommu(iommu_table); + + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); + /* + * Note that the sync frees the gather's free list, so we must + * not have any pages on that list that are covered by iova/len + */ + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); + } + + iommu_pages_list_splice(free_list, &iotlb_gather->freelist); +} #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) @@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common, log2_to_int(pt_top_memsize_lg2(common, top_of_table))); } +struct pt_unmap_args { + struct iommu_pages_list free_list; + pt_vaddr_t unmapped; +}; + +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_unmap_args *unmap = arg; + unsigned int num_oas = 0; + unsigned int start_index; + int ret = 0; + + _pt_iter_first(&pts); + start_index = pts.index; + pts.type = pt_load_entry_raw(&pts); + /* + * A starting index is in the middle of a contiguous entry + * + * The IOMMU API does not
require drivers to support unmapping parts of + * large pages. Long ago VFIO would try to split maps but the current + * version never does. + * + * Instead when unmap reaches a partial unmap of the start of a large + * IOPTE it should remove the entire IOPTE and return that size to the + * caller. + */ + if (pts.type == PT_ENTRY_OA) { + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return -EINVAL; + /* Micro optimization */ + goto start_oa; + } + + do { + if (pts.type != PT_ENTRY_OA) { + bool fully_covered; + + if (pts.type != PT_ENTRY_TABLE) { + ret = -EINVAL; + break; + } + + if (pts.index != start_index) + pt_index_to_va(&pts); + pts.table_lower = pt_table_ptr(&pts); + + fully_covered = pt_entry_fully_covered( + &pts, pt_table_item_lg2sz(&pts)); + + ret = pt_descend(&pts, arg, __unmap_range); + if (ret) + break; + + /* + * If the unmapping range fully covers the table then we + * can free it as well. The clear is delayed until we + * succeed in clearing the lower table levels. + */ + if (fully_covered) { + iommu_pages_list_add(&unmap->free_list, + pts.table_lower); + pt_clear_entries(&pts, ilog2(1)); + } + pts.index++; + } else { + unsigned int num_contig_lg2; +start_oa: + /* + * If the caller requested an last that falls within a + * single entry then the entire entry is unmapped and + * the length returned will be larger than requested. + */ + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); + pt_clear_entries(&pts, num_contig_lg2); + num_oas += log2_to_int(num_contig_lg2); + pts.index += log2_to_int(num_contig_lg2); + } + if (pts.index >= pts.end_index) + break; + pts.type = pt_load_entry_raw(&pts); + } while (true); + + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); + return ret; +} + +/** + * unmap_pages() - Make a range of IOVA empty/not present + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_pages() will remove a translation created by map_pages(). It cannot + * subdivide a mapping created by map_pages(), so it should be called with IOVA + * ranges that match those passed to map_pages(). The IOVA range can aggregate + * contiguous map_pages() calls so long as no individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the point + * unmapping stopped. 
+ */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 5622856e1998..ceb6bc9cea37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -9,6 +9,7 @@ #include #include +struct iommu_iotlb_gather; struct pt_iommu_ops; /** @@ -119,6 +120,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + size_t pt_iommu_##fmt##_unmap_pages( \ + struct iommu_domain *domain, unsigned long iova, \ + size_t pgsize, size_t pgcount, \ + struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -135,8 +140,9 @@ struct pt_iommu_cfg { * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt */ -#define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* * The driver should setup its domain struct like -- cgit v1.2.3 From dcd6a011a8d523a114af2360a8753de5bd60c139 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:05 -0400 Subject: iommupt: Add map_pages op map is slightly complicated because it has to handle a number of special edge cases: - Overmapping a previously shared, but now empty, table level with an OA. Requires validating and freeing the possibly empty tables - Doing the above across an entire to-be-created contiguous entry - Installing a new shared table level concurrently with another thread - Expanding the table by adding more top levels Table expansion is a unique feature of AMDv1; this version is quite similar except that it handles racing concurrent lockless maps. The table top pointer and starting level are encoded in a single uintptr_t which ensures we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use that fixed point as its starting point. Concurrent expansion is handled with a table global spinlock. When inserting a new table entry, map checks that the entire portion of the table is empty. This includes freeing any empty lower tables that will be overwritten by an OA. A separate free list is used while checking and collecting all the empty lower tables so that writing the new entry is uninterrupted: either the new entry fully writes or nothing changes. A special fast path for PAGE_SIZE is implemented that does a direct walk to the leaf level and installs a single entry. This gives ~15% improvement for iommu_map() when mapping lists of single pages.
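The way the table top and level are packed into one word can be sketched roughly as follows. This is an illustrative sketch only, not the in-tree helpers (the real code uses the _pt_top_set()/_pt_top_range() helpers visible in the diff below); encode_top(), decode_top() and read_top() are made-up names. The idea is that table memory is at least 8-byte aligned, so the low bits of the pointer are free to carry the level, and a single aligned READ_ONCE() always yields a consistent pointer/level pair:

/*
 * Illustrative sketch, not the in-tree helpers. Table allocations are at
 * least 8-byte aligned, so the low 3 bits of the pointer can hold the level
 * (AMDv1 has at most 6 levels, numbered 0 to 5).
 */
#define TOP_LEVEL_MASK 0x7UL

static uintptr_t encode_top(void *table, unsigned int level)
{
	return (uintptr_t)table | level;
}

static void *decode_top(uintptr_t top_of_table, unsigned int *level)
{
	*level = top_of_table & TOP_LEVEL_MASK;
	return (void *)(top_of_table & ~TOP_LEVEL_MASK);
}

/*
 * Readers snapshot pointer and level together; the expansion path publishes
 * a fully initialized new top with WRITE_ONCE() while holding the top lock.
 */
static void *read_top(uintptr_t *top_of_table, unsigned int *level)
{
	return decode_top(READ_ONCE(*top_of_table), level);
}

With this layout a mapper that races an expansion simply re-reads the top and retries, which is what the -EAGAIN handling around increase_top() in the diff below does.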
This version sits under the iommu_domain_ops as map_pages() but does not require the external page size calculation. The implementation is actually map_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangement of page sizes. A future series can optimize iommu_map() to take advantage of this. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 501 +++++++++++++++++++++++++++++++++++- drivers/iommu/generic_pt/pt_iter.h | 2 +- include/linux/generic_pt/iommu.h | 59 +++++ 3 files changed, 560 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index e3d1b272723d..f32e81509f4f 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -91,6 +91,23 @@ static __maybe_unused int make_range_u64(struct pt_common *common, ret; \ }) +static inline unsigned int compute_best_pgsize(struct pt_state *pts, + pt_oaddr_t oa) +{ + struct pt_iommu *iommu_table = iommu_from_common(pts->range->common); + + if (!pt_can_have_leaf(pts)) + return 0; + + /* + * The page size is limited by the domain's bitmap. This allows the core + * code to reduce the supported page sizes by changing the bitmap. + */ + return pt_compute_best_pgsize(pt_possible_sizes(pts) & + iommu_table->domain.pgsize_bitmap, + pts->range->va, pts->range->last_va, oa); +} + static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, unsigned int level, struct pt_table_p *table, @@ -147,6 +164,8 @@ EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); struct pt_iommu_collect_args { struct iommu_pages_list free_list; + /* Fail if any OAs are within the range */ + u8 check_mapped : 1; }; static int __collect_tables(struct pt_range *range, void *arg, @@ -156,7 +175,7 @@ static int __collect_tables(struct pt_range *range, void *arg, struct pt_iommu_collect_args *collect = arg; int ret; - if (!pt_can_have_table(&pts)) + if (!collect->check_mapped && !pt_can_have_table(&pts)) return 0; for_each_pt_level_entry(&pts) { @@ -167,6 +186,8 @@ static int __collect_tables(struct pt_range *range, void *arg, return ret; continue; } + if (pts.type == PT_ENTRY_OA && collect->check_mapped) + return -EADDRINUSE; } return 0; } @@ -187,6 +208,477 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common, log2_to_int(pt_top_memsize_lg2(common, top_of_table))); } +/* Allocate an interior table */ +static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, + gfp_t gfp) +{ + struct pt_iommu *iommu_table = + iommu_from_common(parent_pts->range->common); + struct pt_state child_pts = + pt_init(parent_pts->range, parent_pts->level - 1, NULL); + + return iommu_alloc_pages_node_sz( + iommu_table->nid, gfp, + log2_to_int(pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE))); +} + +static inline int pt_iommu_new_table(struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + struct pt_table_p *table_mem; + phys_addr_t phys; + + /* Given PA/VA/length can't be represented */ + if (PT_WARN_ON(!pt_can_have_table(pts))) + return -ENXIO; + + table_mem = table_alloc(pts, attrs->gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + + phys = virt_to_phys(table_mem); + if (!pt_install_table(pts, phys, attrs)) { + iommu_free_pages(table_mem); + return -EAGAIN; + } + + if
(IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + /* + * The underlying table can't store the physical table address. + * This happens when kunit testing tables outside their normal + * environment where a CPU might be limited. + */ + pt_load_single_entry(pts); + if (PT_WARN_ON(pt_table_pa(pts) != phys)) { + pt_clear_entries(pts, ilog2(1)); + iommu_free_pages(table_mem); + return -EINVAL; + } + } + + pts->table_lower = table_mem; + return 0; +} + +struct pt_iommu_map_args { + struct iommu_iotlb_gather *iotlb_gather; + struct pt_write_attrs attrs; + pt_oaddr_t oa; + unsigned int leaf_pgsize_lg2; + unsigned int leaf_level; +}; + +/* + * This will recursively check any tables in the block to validate they are + * empty and then free them through the gather. + */ +static int clear_contig(const struct pt_state *start_pts, + struct iommu_iotlb_gather *iotlb_gather, + unsigned int step, unsigned int pgsize_lg2) +{ + struct pt_iommu *iommu_table = + iommu_from_common(start_pts->range->common); + struct pt_range range = *start_pts->range; + struct pt_state pts = + pt_init(&range, start_pts->level, start_pts->table); + struct pt_iommu_collect_args collect = { .check_mapped = true }; + int ret; + + pts.index = start_pts->index; + pts.end_index = start_pts->index + step; + for (; _pt_iter_load(&pts); pt_next_entry(&pts)) { + if (pts.type == PT_ENTRY_TABLE) { + collect.free_list = + IOMMU_PAGES_LIST_INIT(collect.free_list); + ret = pt_walk_descend_all(&pts, __collect_tables, + &collect); + if (ret) + return ret; + + /* + * The table item must be cleared before we can update + * the gather + */ + pt_clear_entries(&pts, ilog2(1)); + + iommu_pages_list_add(&collect.free_list, + pt_table_ptr(&pts)); + gather_range_pages( + iotlb_gather, iommu_table, range.va, + log2_to_int(pt_table_item_lg2sz(&pts)), + &collect.free_list); + } else if (pts.type != PT_ENTRY_EMPTY) { + return -EADDRINUSE; + } + } + return 0; +} + +static int __map_range_leaf(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int start_index; + pt_oaddr_t oa = map->oa; + unsigned int step; + bool need_contig; + int ret = 0; + + PT_WARN_ON(map->leaf_level != level); + PT_WARN_ON(!pt_can_have_leaf(&pts)); + + step = log2_to_int_t(unsigned int, + leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); + need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + + _pt_iter_first(&pts); + start_index = pts.index; + do { + pts.type = pt_load_entry_raw(&pts); + if (pts.type != PT_ENTRY_EMPTY || need_contig) { + if (pts.index != start_index) + pt_index_to_va(&pts); + ret = clear_contig(&pts, map->iotlb_gather, step, + leaf_pgsize_lg2); + if (ret) + break; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + pt_index_to_va(&pts); + PT_WARN_ON(compute_best_pgsize(&pts, oa) != + leaf_pgsize_lg2); + } + pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs); + + oa += log2_to_int(leaf_pgsize_lg2); + pts.index += step; + } while (pts.index < pts.end_index); + + map->oa = oa; + return ret; +} + +static int __map_range(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + int ret; + + PT_WARN_ON(map->leaf_level == level); + PT_WARN_ON(!pt_can_have_table(&pts)); + + _pt_iter_first(&pts); + + /* Descend to a child table */ + do { + pts.type = 
pt_load_entry_raw(&pts); + + if (pts.type != PT_ENTRY_TABLE) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + ret = pt_iommu_new_table(&pts, &map->attrs); + if (ret) { + /* + * Racing with another thread installing a table + */ + if (ret == -EAGAIN) + continue; + return ret; + } + } else { + pts.table_lower = pt_table_ptr(&pts); + } + + /* + * The already present table can possibly be shared with another + * concurrent map. + */ + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + if (ret) + return ret; + + pts.index++; + pt_index_to_va(&pts); + if (pts.index >= pts.end_index) + break; + } while (true); + return 0; +} + +/* + * Fast path for the easy case of mapping a 4k page to an already allocated + * table. This is a common workload. If it returns EAGAIN run the full algorithm + * instead. + */ +static __always_inline int __do_map_single_page(struct pt_range *range, + void *arg, unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + + pts.type = pt_load_single_entry(&pts); + if (level == 0) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT, + &map->attrs); + map->oa += PAGE_SIZE; + return 0; + } + if (pts.type == PT_ENTRY_TABLE) + return pt_descend(&pts, arg, descend_fn); + /* Something else, use the slow path */ + return -EAGAIN; +} +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page); + +/* + * Add a table to the top, increasing the top level as much as necessary to + * encompass range. + */ +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list); + struct pt_common *common = common_from_iommu(iommu_table); + uintptr_t top_of_table = READ_ONCE(common->top_of_table); + uintptr_t new_top_of_table = top_of_table; + struct pt_table_p *table_mem; + unsigned int new_level; + spinlock_t *domain_lock; + unsigned long flags; + int ret; + + while (true) { + struct pt_range top_range = + _pt_top_range(common, new_top_of_table); + struct pt_state pts = pt_init_top(&top_range); + + top_range.va = range->va; + top_range.last_va = range->last_va; + + if (!pt_check_range(&top_range) && map->leaf_level <= pts.level) + break; + + pts.level++; + if (pts.level > PT_MAX_TOP_LEVEL || + pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) { + ret = -ERANGE; + goto err_free; + } + + new_level = pts.level; + table_mem = table_alloc_top( + common, _pt_top_set(NULL, pts.level), map->attrs.gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + iommu_pages_list_add(&free_list, table_mem); + + /* The new table links to the lower table always at index 0 */ + top_range.va = 0; + top_range.top_level = new_level; + pts.table_lower = pts.table; + pts.table = table_mem; + pt_load_single_entry(&pts); + PT_WARN_ON(pts.index != 0); + pt_install_table(&pts, virt_to_phys(pts.table_lower), + &map->attrs); + new_top_of_table = _pt_top_set(pts.table, pts.level); + } + + /* + * top_of_table is write locked by the spinlock, but readers can use + * READ_ONCE() to get the value. Since we encode both the level and the + * pointer in one quanta the lockless reader will always see something + * valid. 
The HW must be updated to the new level under the spinlock + * before top_of_table is updated so that concurrent readers don't map + * into the new level until it is fully functional. If another thread + * already updated it while we were working then throw everything away + * and try again. + */ + domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table); + spin_lock_irqsave(domain_lock, flags); + if (common->top_of_table != top_of_table) { + spin_unlock_irqrestore(domain_lock, flags); + ret = -EAGAIN; + goto err_free; + } + + /* + * We do not issue any flushes for change_top on the expectation that + * any walk cache will not become a problem by adding another layer to + * the tree. Misses will rewalk from the updated top pointer, hits + * continue to be correct. Negative caching is fine too since all the + * new IOVA added by the new top is non-present. + */ + iommu_table->driver_ops->change_top( + iommu_table, virt_to_phys(table_mem), new_level); + WRITE_ONCE(common->top_of_table, new_top_of_table); + spin_unlock_irqrestore(domain_lock, flags); + return 0; + +err_free: + iommu_put_pages_list(&free_list); + return ret; +} + +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct pt_common *common = common_from_iommu(iommu_table); + int ret; + + do { + ret = pt_check_range(range); + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + return ret; + + if (!ret && map->leaf_level <= range->top_level) + break; + + ret = increase_top(iommu_table, range, map); + if (ret && ret != -EAGAIN) + return ret; + + /* Reload the new top */ + *range = pt_make_range(common, range->va, range->last_va); + } while (ret); + PT_WARN_ON(pt_check_range(range)); + return 0; +} + +static int do_map(struct pt_range *range, bool single_page, + struct pt_iommu_map_args *map) +{ + if (single_page) { + int ret; + + ret = pt_walk_range(range, __map_single_page, map); + if (ret != -EAGAIN) + return ret; + /* EAGAIN falls through to the full path */ + } + + if (map->leaf_level == range->top_level) + return pt_walk_range(range, __map_range_leaf, map); + return pt_walk_range(range, __map_range, map); +} + +/** + * map_pages() - Install translation for an IOVA range + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The caller + * must specify a valid pgsize and pgcount to segment the range into compatible + * blocks. + * + * On error the caller will probably want to invoke unmap on the range from iova + * up to the amount indicated by @mapped to return the table back to an + * unchanged state. + * + * Context: The caller must hold a write range lock that includes the whole + * range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were + * mapped are added to @mapped; @mapped is not zeroed first.
+ */ +int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; + struct pt_common *common = common_from_iommu(iommu_table); + struct iommu_iotlb_gather iotlb_gather; + pt_vaddr_t len = pgsize * pgcount; + struct pt_iommu_map_args map = { + .iotlb_gather = &iotlb_gather, + .oa = paddr, + .leaf_pgsize_lg2 = vaffs(pgsize), + }; + bool single_page = false; + struct pt_range range; + int ret; + + iommu_iotlb_gather_init(&iotlb_gather); + + if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE)))) + return -EINVAL; + + /* Check the paddr doesn't exceed what the table can store */ + if ((sizeof(pt_oaddr_t) < sizeof(paddr) && + (pt_vaddr_t)paddr > PT_VADDR_MAX) || + (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 && + oalog2_div(paddr, common->max_oasz_lg2))) + return -ERANGE; + + ret = pt_iommu_set_prot(common, &map.attrs, prot); + if (ret) + return ret; + map.attrs.gfp = gfp; + + ret = make_range_no_check(common, &range, iova, len); + if (ret) + return ret; + + /* Calculate target page size and level for the leaves */ + if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && + pgcount == 1) { + PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (log2_mod(iova | paddr, PAGE_SHIFT)) + return -ENXIO; + map.leaf_pgsize_lg2 = PAGE_SHIFT; + map.leaf_level = 0; + single_page = true; + } else { + map.leaf_pgsize_lg2 = pt_compute_best_pgsize( + pgsize_bitmap, range.va, range.last_va, paddr); + if (!map.leaf_pgsize_lg2) + return -ENXIO; + map.leaf_level = + pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + } + + ret = check_map_range(iommu_table, &range, &map); + if (ret) + return ret; + + PT_WARN_ON(map.leaf_level > range.top_level); + + ret = do_map(&range, single_page, &map); + + /* + * Table levels were freed and replaced with large items, flush any walk + * cache that may refer to the freed levels. + */ + if (!iommu_pages_list_empty(&iotlb_gather.freelist)) + iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather); + + /* Bytes successfully mapped */ + PT_WARN_ON(!ret && map.oa - paddr != len); + *mapped += map.oa - paddr; + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); + struct pt_unmap_args { struct iommu_pages_list free_list; pt_vaddr_t unmapped; @@ -445,6 +937,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table) memset_after(fmt_table, 0, iommu.domain); /* The caller can initialize some of these values */ + iommu_table->driver_ops = cfg.driver_ops; iommu_table->nid = cfg.nid; } @@ -478,6 +971,12 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table, if (ret) return ret; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + WARN_ON(!iommu_table->driver_ops || + !iommu_table->driver_ops->change_top || + !iommu_table->driver_ops->get_top_lock)) + return -EINVAL; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && (pt_feature(common, PT_FEAT_FULL_VA) || pt_feature(common, PT_FEAT_DYNAMIC_TOP))) diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h index 87f4a26c1a41..c0d8617cce29 100644 --- a/drivers/iommu/generic_pt/pt_iter.h +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -612,7 +612,7 @@ static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, * This builds a function call tree that can be fully inlined. 
* The caller must provide a function body in an __always_inline function:: * - * static __always_inline int do(struct pt_range *range, void *arg, + * static __always_inline int do_fn(struct pt_range *range, void *arg, * unsigned int level, struct pt_table_p *table, * pt_level_fn_t descend_fn) * diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index ceb6bc9cea37..0d59423024d5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -11,6 +11,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; +struct pt_iommu_driver_ops; /** * DOC: IOMMU Radix Page Table @@ -43,6 +44,12 @@ struct pt_iommu { */ const struct pt_iommu_ops *ops; + /** + * @driver_ops: Function pointers provided by the HW driver to help + * manage HW details like caches. + */ + const struct pt_iommu_driver_ops *driver_ops; + /** * @nid: Node ID to use for table memory allocations. The IOMMU driver * may want to set the NID to the device's NID, if there are multiple @@ -84,6 +91,53 @@ struct pt_iommu_ops { void (*deinit)(struct pt_iommu *iommu_table); }; +/** + * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations + * + * The IOMMU driver should implement these using container_of(iommu_table) to + * get to its iommu_domain derived structure. All ops can be called in atomic + * contexts as they are buried under DMA API calls. + */ +struct pt_iommu_driver_ops { + /** + * @change_top: Update the top of table pointer + * @iommu_table: Table to operate on + * @top_paddr: New CPU physical address of the top pointer + * @top_level: IOMMU PT level of the new top + * + * Called under the get_top_lock() spinlock. The driver must update all + * HW references to this domain with a new top address and + * configuration. On return mappings placed in the new top must be + * reachable by the HW. + * + * top_level encodes the level in IOMMU PT format, level 0 is the + * smallest page size increasing from there. This has to be translated + * to any HW specific format. During this call the new top will not be + * visible to any other API. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled. + */ + void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr, + unsigned int top_level); + + /** + * @get_top_lock: lock to hold when changing the table top + * @iommu_table: Table to operate on + * + * Return a lock to hold when changing the table top page table from + * being stored in HW. The lock will be held prior to calling + * change_top() and released once the top is fully visible. + * + * Typically this would be a lock that protects the iommu_domain's + * attachment list. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled.
+ */ + spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table); +}; + static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) { /* @@ -120,6 +174,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ + unsigned long iova, phys_addr_t paddr, \ + size_t pgsize, size_t pgcount, \ + int prot, gfp_t gfp, size_t *mapped); \ size_t pt_iommu_##fmt##_unmap_pages( \ struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ @@ -142,6 +200,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* -- cgit v1.2.3 From 4a00f943489103b4b9edff9f39bd484efbfb15fa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:06 -0400 Subject: iommupt: Add read_and_clear_dirty op IOMMU HW now supports updating a dirty bit in an entry when a DMA writes to the entry's VA range. iommufd has a uAPI to read and clear the dirty bits from the tables. This is a trivial recursive descent algorithm to read and optionally clear the dirty bits. The format needs a function to tell if a contiguous entry is dirty, and a function to clear a contiguous entry back to clean. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 104 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 6 +++ 2 files changed, 110 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index f32e81509f4f..448c5796d4a8 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -162,6 +162,108 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); +struct pt_iommu_dirty_args { + struct iommu_dirty_bitmap *dirty; + unsigned int flags; +}; + +static void record_dirty(struct pt_state *pts, + struct pt_iommu_dirty_args *dirty, + unsigned int num_contig_lg2) +{ + pt_vaddr_t dirty_len; + + if (num_contig_lg2 != ilog2(1)) { + unsigned int index = pts->index; + unsigned int end_index = log2_set_mod_max_t( + unsigned int, pts->index, num_contig_lg2); + + /* Adjust for being contained inside a contiguous page */ + end_index = min(end_index, pts->end_index); + dirty_len = (end_index - index) * + log2_to_int(pt_table_item_lg2sz(pts)); + } else { + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); + } + + if (dirty->dirty->bitmap) + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, + dirty_len); + + if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { + pt_entry_make_write_clean(pts); + iommu_iotlb_gather_add_range(dirty->dirty->gather, + pts->range->va, dirty_len); + } +} + +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_dirty_args *dirty = arg; + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + ret = pt_descend(&pts, arg, __read_and_clear_dirty); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) + 
record_dirty(&pts, dirty, + pt_entry_num_contig_lg2(&pts)); + } + return 0; +} + +/** + * read_and_clear_dirty() - Manipulate the HW set write dirty state + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the IOVA + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR + * @dirty: Place to store the dirty bits + * + * Iterate over all the entries in the mapped range and record their write dirty + * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified then + * the entries will be left dirty, otherwise they are returned to being not + * write dirty. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Returns: -ERRNO on failure, 0 on success. + */ +int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_iommu_dirty_args dirty_args = { + .dirty = dirty, + .flags = flags, + }; + struct pt_range range; + int ret; + +#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty) + return -EOPNOTSUPP; +#endif + + ret = make_range(common_from_iommu(iommu_table), &range, iova, size); + if (ret) + return ret; + + ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args); + PT_WARN_ON(ret); + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); + struct pt_iommu_collect_args { struct iommu_pages_list free_list; /* Fail if any OAs are within the range */ @@ -1015,5 +1117,7 @@ EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); MODULE_IMPORT_NS("GENERIC_PT"); +/* For iommu_dirty_bitmap_record() */ +MODULE_IMPORT_NS("IOMMUFD"); #endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0d59423024d5..03a906fbe12a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -12,6 +12,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; struct pt_iommu_driver_ops; +struct iommu_dirty_bitmap; /** * DOC: IOMMU Radix Page Table @@ -182,6 +183,9 @@ struct pt_iommu_cfg { struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ struct iommu_iotlb_gather *iotlb_gather); \ + int pt_iommu_##fmt##_read_and_clear_dirty( \ + struct iommu_domain *domain, unsigned long iova, size_t size, \ + unsigned long flags, struct iommu_dirty_bitmap *dirty); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -202,6 +206,8 @@ struct pt_iommu_cfg { .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages +#define IOMMU_PT_DIRTY_OPS(fmt) \ + .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty /* * The driver should setup its domain struct like -- cgit v1.2.3 From e5359dcc617a2174d834bab4083340196615d8bd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:08 -0400 Subject: iommupt: Add a mock pagetable format for iommufd selftest to use The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table.
Slightly modify the amdv1 page table to create a real page table that has similar properties: - 2k base granule to simulate something like a 4k page table on a 64K PAGE_SIZE ARM system - Contiguous page support for every PFN order - Dirty tracking AMDv1 is the closest format, as it is the only one that already supports every page size. Tweak it to have only 5 levels and an 11 bit base granule and compile it separately as a format variant. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/fmt/Makefile | 1 + drivers/iommu/generic_pt/fmt/amdv1.h | 18 ++++++++++++++++-- drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 ++++++++++ include/linux/generic_pt/iommu.h | 6 ++++++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 32f3956c7509..f0c22cf5f7be 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 +iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock IOMMU_PT_KUNIT_TEST := define create_format diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h index aaf76bfd21da..aa8e1a8ec95f 100644 --- a/drivers/iommu/generic_pt/fmt/amdv1.h +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -26,11 +26,23 @@ #include enum { - PT_MAX_OUTPUT_ADDRESS_LG2 = 52, - PT_MAX_VA_ADDRESS_LG2 = 64, PT_ITEM_WORD_SIZE = sizeof(u64), + /* + * The IOMMUFD selftest uses the AMDv1 format with some alterations. It + * uses a 2k page size to test cases where the CPU page size is not the + * same.
+ */ +#ifdef AMDV1_IOMMUFD_SELFTEST + PT_MAX_VA_ADDRESS_LG2 = 56, + PT_MAX_OUTPUT_ADDRESS_LG2 = 51, + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 11, +#else + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, PT_MAX_TOP_LEVEL = 5, PT_GRANULE_LG2SZ = 12, +#endif PT_TABLEMEM_LG2SZ = 12, /* The DTE only has these bits for the top physical address */ @@ -374,6 +386,7 @@ static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, } #define pt_iommu_fmt_init amdv1pt_iommu_fmt_init +#ifndef PT_FMT_VARIANT static inline void amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, const struct pt_range *top_range, @@ -384,6 +397,7 @@ amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, info->mode = top_range->top_level + 1; } #define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif #if defined(GENERIC_PT_KUNIT) static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = { diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c new file mode 100644 index 000000000000..74e597cba9d9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define AMDV1_IOMMUFD_SELFTEST 1 +#define PT_FMT amdv1 +#define PT_FMT_VARIANT mock +#define PT_SUPPORTED_FEATURES 0 + +#include "iommu_template.h" diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 03a906fbe12a..848a5fb76272 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -237,6 +237,12 @@ struct pt_iommu_amdv1_hw_info { IOMMU_FORMAT(amdv1, amdpt); +/* amdv1_mock is used by the iommufd selftest */ +#define pt_iommu_amdv1_mock pt_iommu_amdv1 +#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg +struct pt_iommu_amdv1_mock_hw_info; +IOMMU_PROTOTYPES(amdv1_mock); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From e93d5945ed5bb086431e83eed7ab98b6c058cc0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:09 -0400 Subject: iommufd: Change the selftest to use iommupt instead of xarray The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table. Make it act more like a real iommu driver by replacing the xarray with an iommupt based page table. The new AMDv1 mock format behaves similarly to the xarray. Add set_dirty() as an iommu_pt operation to allow the test suite to simulate HW dirty bits. Userspace can select between several formats, including the normal AMDv1 format and a special MOCK_IOMMUPT_HUGE variation for testing huge page dirty tracking. To make the dirty tracking test work, the page table must only store exactly 2M huge pages, otherwise the logic the test uses fails. They cannot be broken up or combined. Aside from aligning the selftest with a real page table implementation, this helps test the iommupt code itself.
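As a rough illustration of how a test might select one of these layouts (a sketch only: the real tests go through the wrappers in tools/testing/selftests/iommu/iommufd_utils.h, and alloc_mock_hwpt() below is a made-up helper, not part of that file), the chosen pagetable_type travels as IOMMU_HWPT_DATA_SELFTEST user data on the HWPT allocation ioctl:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>
#include "iommufd_test.h"	/* IOMMU_HWPT_DATA_SELFTEST, MOCK_IOMMUPT_*; include path depends on the build */

/*
 * Hypothetical helper: allocate a HWPT whose mock page table uses the
 * requested layout (MOCK_IOMMUPT_DEFAULT, MOCK_IOMMUPT_HUGE or
 * MOCK_IOMMUPT_AMDV1). Shows the uapi data flow only; error handling is
 * minimal.
 */
static int alloc_mock_hwpt(int iommufd, uint32_t dev_id, uint32_t ioas_id,
			   uint32_t pagetable_type, uint32_t *out_hwpt_id)
{
	struct iommu_hwpt_selftest data = {
		.pagetable_type = pagetable_type,
	};
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.dev_id = dev_id,
		.pt_id = ioas_id,
		.data_type = IOMMU_HWPT_DATA_SELFTEST,
		.data_len = sizeof(data),
		.data_uptr = (uintptr_t)&data,
	};

	if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*out_hwpt_id = cmd.out_hwpt_id;
	return 0;
}

MOCK_IOMMUPT_HUGE is what the huge page dirty tracking test would select, while MOCK_IOMMUPT_AMDV1 exercises the normal AMDv1 configuration.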
Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 39 +++ drivers/iommu/iommufd/Kconfig | 1 + drivers/iommu/iommufd/iommufd_test.h | 11 +- drivers/iommu/iommufd/selftest.c | 424 +++++++++++--------------- include/linux/generic_pt/iommu.h | 12 + tools/testing/selftests/iommu/iommufd.c | 60 ++-- tools/testing/selftests/iommu/iommufd_utils.h | 12 + 7 files changed, 282 insertions(+), 277 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 448c5796d4a8..142001f5aa83 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -264,6 +264,41 @@ int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); +static inline int __set_dirty(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, __set_dirty); + case PT_ENTRY_OA: + if (!pt_entry_make_write_dirty(&pts)) + return -EAGAIN; + return 0; + } + return -ENOENT; +} + +static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table, + dma_addr_t iova) +{ + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + /* + * Note: There is no locking here yet, if the test suite races this it + * can crash. It should use RCU locking eventually. + */ + return pt_walk_range(&range, __set_dirty, NULL); +} + struct pt_iommu_collect_args { struct iommu_pages_list free_list; /* Fail if any OAs are within the range */ @@ -957,6 +992,10 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ + IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) + .set_dirty = NS(set_dirty), +#endif .get_info = NS(get_info), .deinit = NS(deinit), }; diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 2beeb4f60ee5..eae3f03629b0 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -41,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + depends on IOMMU_PT_AMDV1 select IOMMUFD_DRIVER default n help diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf9..781a75c79eea 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -31,9 +31,18 @@ enum { IOMMU_TEST_OP_PASID_CHECK_HWPT, }; +enum { + MOCK_IOMMUPT_DEFAULT = 0, + MOCK_IOMMUPT_HUGE, + MOCK_IOMMUPT_AMDV1, +}; + +/* These values are true for MOCK_IOMMUPT_DEFAULT */ enum { MOCK_APERTURE_START = 1UL << 24, MOCK_APERTURE_LAST = (1UL << 31) - 1, + MOCK_PAGE_SIZE = 2048, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, }; enum { @@ -52,7 +61,6 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; @@ -205,6 +213,7 @@ struct iommu_test_hw_info { */ struct iommu_hwpt_selftest { __u32 iotlb; + __u32 pagetable_type; }; /* Should not be equal to any defined value in 
enum iommu_hwpt_invalidate_data_type */ diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 5661d2da2b67..f6379f387d3b 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include "../iommu-pages.h" #include "../iommu-priv.h" #include "io_pagetable.h" @@ -41,21 +43,6 @@ static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, - - /* - * Like a real page table alignment requires the low bits of the address - * to be zero. xarray also requires the high bit to be zero, so we store - * the pfns shifted. The upper bits are used for metadata. - */ - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, - - _MOCK_PFN_START = MOCK_PFN_MASK + 1, - MOCK_PFN_START_IOVA = _MOCK_PFN_START, - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); @@ -124,10 +111,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + }; unsigned long flags; - struct iommu_domain domain; - struct xarray pfns; }; +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); static inline struct mock_iommu_domain * to_mock_domain(struct iommu_domain *domain) @@ -344,74 +336,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, - unsigned long iova, size_t page_size, - unsigned long flags) -{ - unsigned long cur, end = iova + page_size - 1; - bool dirty = false; - void *ent, *old; - - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) - continue; - - dirty = true; - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - } - - return dirty; -} - -static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long end = iova + size; - void *ent; - - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) - return -EINVAL; - - do { - unsigned long pgsize = MOCK_IO_PAGE_SIZE; - unsigned long head; - - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent) { - iova += pgsize; - continue; - } - - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) - pgsize = MOCK_HUGE_PAGE_SIZE; - head = iova & ~(pgsize - 1); - - /* Clear dirty */ - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops dirty_ops = { - .set_dirty_tracking = mock_domain_set_dirty_tracking, - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, -}; - static struct mock_iommu_domain_nested * __mock_domain_alloc_nested(const struct iommu_user_data 
*user_data) { @@ -446,7 +370,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (!parent || parent->ops != mock_ops.default_domain_ops) + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) return ERR_PTR(-EINVAL); mock_parent = to_mock_domain(parent); @@ -459,159 +383,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, return &mock_nested->domain; } -static struct iommu_domain * -mock_domain_alloc_paging_flags(struct device *dev, u32 flags, - const struct iommu_user_data *user_data) -{ - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_PASID; - struct mock_dev *mdev = to_mock_dev(dev); - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct mock_iommu_domain *mock; - - if (user_data) - return ERR_PTR(-EOPNOTSUPP); - if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return ERR_PTR(-ENOMEM); - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - - if (has_dirty_flag) - mock->domain.dirty_ops = &dirty_ops; - return &mock->domain; -} - static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = to_mock_domain(domain); - WARN_ON(!xa_empty(&mock->pfns)); + pt_iommu_deinit(&mock->iommu); kfree(mock); } -static int mock_domain_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) +static void mock_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long flags = MOCK_PFN_START_IOVA; - unsigned long start_iova = iova; + iommu_put_pages_list(&gather->freelist); +} - /* - * xarray does not reliably work with fault injection because it does a - * retry allocation, so put our own failure point. 
- */ - if (iommufd_should_fail()) - return -ENOENT; +static const struct iommu_domain_ops amdv1_mock_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); - for (; pgcount; pgcount--) { - size_t cur; +static const struct iommu_domain_ops amdv1_mock_huge_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; +#undef pt_iommu_amdv1_mock_map_pages - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - void *old; +static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1_mock), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - flags = MOCK_PFN_LAST_IOVA; - if (pgsize != MOCK_IO_PAGE_SIZE) { - flags |= MOCK_PFN_HUGE_IOVA; - } - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | - flags), - gfp); - if (xa_is_err(old)) { - for (; start_iova != iova; - start_iova += MOCK_IO_PAGE_SIZE) - xa_erase(&mock->pfns, - start_iova / - MOCK_IO_PAGE_SIZE); - return xa_err(old); - } - WARN_ON(old); - iova += MOCK_IO_PAGE_SIZE; - paddr += MOCK_IO_PAGE_SIZE; - *mapped += MOCK_IO_PAGE_SIZE; - flags = 0; - } - } - return 0; -} +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; -static size_t mock_domain_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather) +static struct mock_iommu_domain * +mock_domain_alloc_pgtable(struct device *dev, + const struct iommu_hwpt_selftest *user_cfg, u32 flags) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - bool first = true; - size_t ret = 0; - void *ent; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return ERR_PTR(-ENOMEM); + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - for (; pgcount; pgcount--) { - size_t cur; + mock->amdv1.iommu.nid = NUMA_NO_NODE; + + switch (user_cfg->pagetable_type) { + case MOCK_IOMMUPT_DEFAULT: + case MOCK_IOMMUPT_HUGE: { + struct pt_iommu_amdv1_cfg cfg = {}; + + /* The mock version has a 2k page size */ + cfg.common.hw_max_vasz_lg2 = 56; + cfg.common.hw_max_oasz_lg2 = 51; + cfg.starting_level = 2; + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.ops = &amdv1_mock_huge_ops; + else + mock->domain.ops = &amdv1_mock_ops; + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + + /* + * In huge mode userspace should only provide huge pages, we + * have to include PAGE_SIZE for the domain to be accepted by + * iommufd. 
+ */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | + PAGE_SIZE; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; + break; + } - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + case MOCK_IOMMUPT_AMDV1: { + struct pt_iommu_amdv1_cfg cfg = {}; + + cfg.common.hw_max_vasz_lg2 = 64; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + cfg.starting_level = 2; + mock->domain.ops = &amdv1_ops; + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_dirty_ops; + break; + } + default: + rc = -EOPNOTSUPP; + goto err_free; + } - /* - * iommufd generates unmaps that must be a strict - * superset of the map's performend So every - * starting/ending IOVA should have been an iova passed - * to map. - * - * This simple logic doesn't work when the HUGE_PAGE is - * turned on since the core code will automatically - * switch between the two page sizes creating a break in - * the unmap calls. The break can land in the middle of - * contiguous IOVA. - */ - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; - } - if (pgcount == 1 && - cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); - } + /* + * Override the real aperture to the MOCK aperture for test purposes. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { + WARN_ON(mock->domain.geometry.aperture_start != 0); + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); - iova += MOCK_IO_PAGE_SIZE; - ret += MOCK_IO_PAGE_SIZE; - } + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; } - return ret; + + return mock; +err_free: + kfree(mock); + return ERR_PTR(rc); } -static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static struct iommu_domain * +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - void *ent; + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_hwpt_selftest user_cfg = {}; + struct mock_iommu_domain *mock; + int rc; + + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && + user_data->type != IOMMU_HWPT_DATA_NONE)) + return ERR_PTR(-EOPNOTSUPP); - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; + if (user_data) { + rc = iommu_copy_struct_from_user( + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + } + + mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags); + if (IS_ERR(mock)) + return ERR_CAST(mock); + return &mock->domain; } static bool 
mock_domain_capable(struct device *dev, enum iommu_cap cap) @@ -955,15 +890,6 @@ static const struct iommu_ops mock_ops = { .user_pasid_table = true, .get_viommu_size = mock_get_viommu_size, .viommu_init = mock_viommu_init, - .default_domain_ops = - &(struct iommu_domain_ops){ - .free = mock_domain_free, - .attach_dev = mock_domain_nop_attach, - .map_pages = mock_domain_map_pages, - .unmap_pages = mock_domain_unmap_pages, - .iova_to_phys = mock_domain_iova_to_phys, - .set_dev_pasid = mock_domain_set_dev_pasid_nop, - }, }; static void mock_domain_free_nested(struct iommu_domain *domain) @@ -1047,7 +973,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || - hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt->domain->owner != &mock_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } @@ -1088,7 +1014,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) {}, }; const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | - MOCK_FLAGS_DEVICE_HUGE_IOVA | MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; @@ -1277,23 +1202,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, { struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; + unsigned int page_size; uintptr_t end; int rc; - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) - return -EINVAL; - hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - for (; length; length -= MOCK_IO_PAGE_SIZE) { + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); + if (iova % page_size || length % page_size || + (uintptr_t)uptr % page_size || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= page_size) { struct page *pages[1]; + phys_addr_t io_phys; unsigned long pfn; long npages; - void *ent; npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, pages); @@ -1308,15 +1235,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, pfn = page_to_pfn(pages[0]); put_page(pages[0]); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent || - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); + if (io_phys != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { rc = -EINVAL; goto out_put; } - iova += MOCK_IO_PAGE_SIZE; - uptr += MOCK_IO_PAGE_SIZE; + iova += page_size; + uptr += page_size; } rc = 0; @@ -1795,7 +1721,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - if (!(mock->flags & MOCK_DIRTY_TRACK)) { + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { rc = -EINVAL; goto out_put; } @@ -1814,22 +1740,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } for (i = 0; i < max; i++) { - unsigned long cur = iova + i * page_size; - void *ent, *old; - if (!test_bit(i, (unsigned long *)tmp)) continue; - - ent = xa_load(&mock->pfns, cur / page_size); - if (ent) { - unsigned long val; - - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / page_size, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - count++; - } + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * 
page_size); + count++; } cmd->dirty.out_nr_dirty = count; @@ -2202,3 +2116,5 @@ void iommufd_test_exit(void) platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 848a5fb76272..f2a763aba088 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -73,6 +73,18 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @set_dirty: Make the iova write dirty + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * + * This is only used by iommufd testing. It makes the iova dirty so that + * read_and_clear_dirty() will see it as dirty. Unlike all the other ops + * this one is safe to call without holding any locking. It may return + * -EAGAIN if there is a race. + */ + int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova); + /** * @get_info: Return the pt_iommu_info structure * @iommu_table: Table to query diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 3eebf5e3b974..595b0a3ead64 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -13,9 +13,6 @@ static unsigned long HUGEPAGE_SIZE; -#define MOCK_PAGE_SIZE (PAGE_SIZE / 2) -#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) - static unsigned long get_huge_page_size(void) { char buf[80]; @@ -2058,6 +2055,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_SET, + .val64 = 0, + }; size_t mmap_buffer_size; unsigned long size; int mmap_flags; @@ -2066,7 +2069,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking) if (variant->buffer_size < MOCK_PAGE_SIZE) { SKIP(return, - "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u", variant->buffer_size, MOCK_PAGE_SIZE); } @@ -2114,16 +2117,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - /* Enable 1M mock IOMMU hugepages */ - if (variant->hugepages) { - test_cmd_mock_domain_flags(self->ioas_id, - MOCK_FLAGS_DEVICE_HUGE_IOVA, - &self->stdev_id, &self->hwpt_id, - &self->idev_id); - } else { - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id, &self->idev_id); - } + + /* + * For dirty testing it is important that the page size fed into + * the iommu page tables matches the size the dirty logic + * expects, or set_dirty can touch too much stuff. 
+ */ + cmd.object_id = self->ioas_id; + if (!variant->hugepages) + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -2248,18 +2253,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); @@ -2285,18 +2295,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 772ca1db6e59..08e529fde1cc 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) +#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \ + hwpt_id) \ + ({ \ + struct iommu_hwpt_selftest user_cfg = { \ + .pagetable_type = iommupt_type \ + }; \ + \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, 0, flags, \ + hwpt_id, IOMMU_HWPT_DATA_SELFTEST, \ + &user_cfg, sizeof(user_cfg))); \ + }) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ self->fd, device_id, pt_id, 0, flags, \ -- cgit v1.2.3 From aef5de756ea871ab44e3a1a87be6c944e6587c51 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:10 -0400 Subject: iommupt: Add the x86 64 bit page table format This is used by x86 CPUs and can be used in AMD/VT-d x86 IOMMUs. When a x86 IOMMU is running SVA the MM will be using this format. This implementation follows the AMD v2 io-pgtable version. There is nothing remarkable here, the format can have 4 or 5 levels and limited support for different page sizes. No contiguous pages support. x86 uses a sign extension mechanism where the top bits of the VA must match the sign bit. 
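As background, the rule is easy to state in standalone code. The sketch below is purely illustrative and is not part of this patch; it assumes the 4-level format decodes 48 VA bits (the 5-level format decodes 57), and the helper name exists only for this example:

  #include <stdbool.h>
  #include <stdint.h>

  /*
   * Illustrative sketch: with va_bits decoded by the page table (48 for
   * 4-level, 57 for 5-level), bits 63 down to (va_bits - 1) must be all
   * zeros (lower range) or all ones (upper range) for a VA to be
   * canonical.
   */
  static bool is_canonical_va(uint64_t va, unsigned int va_bits)
  {
          uint64_t top = va >> (va_bits - 1);
          uint64_t all_ones = UINT64_MAX >> (va_bits - 1);

          return top == 0 || top == all_ones;
  }

With va_bits = 48 this accepts 0x00007fffffffffff and 0xffff800000000000 but rejects everything between them, which is exactly the non-canonical hole that splits the address space into a lower and an upper range.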
The core code supports this through PT_FEAT_SIGN_EXTEND, which creates an upper and a lower VA range. All the new operations work correctly in both spaces; however, there is currently no way to report the upper space to other layers. Future patches can improve that.

In principle this can support 3 page table levels, matching the 32 bit PAE table format, but no iommu driver needs this. The focus is on the modern 64 bit 4 and 5 level formats.

Comparing the performance of several operations to the existing version:

 iommu_map()
  pgsz      ,avg new,old ns, min new,old ns , min % (+ve is better)
  2^12      ,     71,61    ,     66,58      , -13.13
  2^21      ,     66,60    ,     61,55      , -10.10
  2^30      ,     59,56    ,     56,54      , -3.03
  256*2^12  ,    392,1360  ,    345,1289    , 73.73
  256*2^21  ,    383,1159  ,    335,1145    , 70.70
  256*2^30  ,    378,965   ,    331,892     , 62.62

 iommu_unmap()
  pgsz      ,avg new,old ns, min new,old ns , min % (+ve is better)
  2^12      ,     77,71    ,     73,68      , -7.07
  2^21      ,     76,70    ,     70,66      , -6.06
  2^30      ,     69,66    ,     66,63      , -4.04
  256*2^12  ,    225,899   ,    210,870     , 75.75
  256*2^21  ,    262,722   ,    248,710     , 65.65
  256*2^30  ,    251,643   ,    244,634     , 61.61

The small -ve values in the iommu_unmap() rows are due to the core code calling iommu_pgsize() before invoking the domain op; this is unnecessary with this implementation. Future work optimizes this and gets to 2%, 4%, 3%.

Reviewed-by: Kevin Tian
Reviewed-by: Vasant Hegde
Tested-by: Alejandro Jimenez
Tested-by: Pasha Tatashin
Signed-off-by: Jason Gunthorpe
Signed-off-by: Joerg Roedel
---
 drivers/iommu/generic_pt/.kunitconfig | 1 +
 drivers/iommu/generic_pt/Kconfig | 11 ++
 drivers/iommu/generic_pt/fmt/Makefile | 2 +
 drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +++
 drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 ++
 drivers/iommu/generic_pt/fmt/x86_64.h | 255 ++++++++++++++++++++++++++
 include/linux/generic_pt/common.h | 13 ++
 include/linux/generic_pt/iommu.h | 11 ++
 8 files changed, 325 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
 create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h
(limited to 'include/linux')

diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
index 936c327f0661..2016c5e5ac0f 100644
--- a/drivers/iommu/generic_pt/.kunitconfig
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y
 CONFIG_DEBUG_GENERIC_PT=y
 CONFIG_IOMMU_PT=y
 CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_X86_64=y
 CONFIG_IOMMU_PT_KUNIT_TEST=y
 CONFIG_IOMMUFD=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 81652cd9c69f..6dcb771b3c58 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -42,10 +42,21 @@ config IOMMU_PT_AMDV1
 	  Selected automatically by an IOMMU driver that uses this format.
+config IOMMU_PT_X86_64
+	tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+	depends on !GENERIC_ATOMIC64 # for cmpxchg64
+	help
+	  iommu_domain implementation for the x86 64-bit 4/5 level page table.
+	  It supports 4K/2M/1G page sizes and can decode a sign-extended
+	  portion of the 64-bit IOVA space.
+
+	  Selected automatically by an IOMMU driver that uses this format.
+ config IOMMU_PT_KUNIT_TEST tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index f0c22cf5f7be..5a3379107999 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + IOMMU_PT_KUNIT_TEST := define create_format obj-$(2) += iommu_$(1).o diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5c5960d871a3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h new file mode 100644 index 000000000000..18d736d14b2d --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * x86 page table. Supports the 4 and 5 level variations. + * + * The 4 and 5 level version is described in: + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software + * Developer's Manual Volume 3 + * + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization + * Technology for Directed I/O Architecture Specification" + * + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O + * Virtualization Technology (IOMMU) Specification" + * + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. + * + * Note the 3 level format is very similar and almost implemented here. The + * reserved/ignored layout is different and there are functional bit + * differences. + * + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign + * extended addressing with this page table format. 
+ * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + 
attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. + */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->common.hw_max_vasz_lg2 < 31 || + cfg->common.hw_max_vasz_lg2 > 57) + return -EINVAL; + + /* Top of 2, 3, 4 */ + pt_top_set_level(&table->common, + (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47 }, + [3] = { .common.hw_max_vasz_lg2 = 56 }, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 21e33489cbf2..96f8a6a7d60e 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -151,4 +151,17 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_x86_64 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. 
+ */ + PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index f2a763aba088..fde7ccf007c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -255,6 +255,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_x86_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_x86_64_hw_info { + u64 gcr3_pt; + u8 levels; +}; + +IOMMU_FORMAT(x86_64, x86_64_pt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 2fdf6db436e3071a8e4c9c3e67674448a13860d4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:12 -0400 Subject: iommu/amd: Remove AMD io_pgtable support None of this is used anymore, delete it. Reviewed-by: Alejandro Jimenez Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Makefile | 2 +- drivers/iommu/amd/amd_iommu_types.h | 98 ------ drivers/iommu/amd/io_pgtable.c | 575 ------------------------------------ drivers/iommu/amd/io_pgtable_v2.c | 370 ----------------------- drivers/iommu/io-pgtable.c | 4 - include/linux/io-pgtable.h | 2 - 6 files changed, 1 insertion(+), 1050 deletions(-) delete mode 100644 drivers/iommu/amd/io_pgtable.c delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c (limited to 'include/linux') diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 59c04a67f398..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index d90a285b44eb..4b4a37fad70e 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -18,7 +18,6 @@ #include #include #include -#include #include /* @@ -338,76 +337,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? 
\ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value definition for DTE fields @@ -437,12 +367,6 @@ /* DTE[128:179] | DTE[184:191] */ #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 #define IOMMU_PROT_IW 0x02 @@ -535,19 +459,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl.cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -555,14 +466,6 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - seqcount_t seqcount; /* Protects root/mode update */ - struct io_pgtable pgtbl; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { PD_MODE_NONE, PD_MODE_V1, @@ -597,7 +500,6 @@ struct 
protection_domain { struct pt_iommu_x86_64 amdv2; }; struct list_head dev_list; /* List of all devices in this domain */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index f64244938c9a..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,575 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. - */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - iommu_pages_list_add(freelist, p); - } - - iommu_pages_list_add(freelist, pt); -} - -static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - iommu_pages_list_add(freelist, root); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. 
- */ -static bool increase_address_space(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned int page_size_level, - gfp_t gfp) -{ - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - struct protection_domain *domain = - container_of(pgtable, struct protection_domain, iop); - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(pgtable->mode) && - pgtable->mode - 1 >= page_size_level) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level)) - goto out; - - *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - - write_seqcount_begin(&pgtable->seqcount); - pgtable->root = pte; - pgtable->mode += 1; - write_seqcount_end(&pgtable->seqcount); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_pages(pte); - - return ret; -} - -static u64 *alloc_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - unsigned long last_addr = address + (page_size - 1); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned int seqcount; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || - pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(pgtable, last_addr, - PAGE_SIZE_LEVEL(page_size), gfp)) - return NULL; - } - - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. - */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, - SZ_4K); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. 
- */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - unsigned int seqcount; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, - struct iommu_pages_list *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. - */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!iommu_pages_list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). 
- */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. - */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. 
- */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > amd_iommu_hpt_level); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - pgtable->root = - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->root) - return NULL; - pgtable->mode = PAGE_MODE_3_LEVEL; - seqcount_init(&pgtable->seqcount); - - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index b47941353ccb..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit - * Author: Vasant Hegde - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include -#include -#include - -#include - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. - */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_pages(p); - } - - iommu_free_pages(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. 
*/ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_pages(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -ENOMEM; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&pdom->lock, flags); - amd_iommu_domain_flush_pages(pdom, o_iova, size); - spin_unlock_irqrestore(&pdom->lock, flags); - } - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, 
unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - cfg->ias = ias; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 8841c1487f00..843fec8e8a51 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, #endif -#ifdef CONFIG_AMD_IOMMU - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, -#endif }; static int check_custom_allocator(enum io_pgtable_fmt fmt, diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 8a823c6f2b4a..7a1516011ccf 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -15,8 +15,6 @@ enum io_pgtable_fmt { ARM_64_LPAE_S2, ARM_V7S, ARM_MALI_LPAE, - AMD_IOMMU_V1, - AMD_IOMMU_V2, APPLE_DART, APPLE_DART2, IO_PGTABLE_NUM_FMTS, -- cgit v1.2.3 From bc5233c0904eb116a4bd94e10cd3666733216063 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:13 -0400 Subject: iommupt: Add a kunit test for the IOMMU implementation This intends to have high coverage of the page table format functions and the IOMMU implementation itself, exercising the various corner cases. 
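For readers who have not used KUnit before, a test is just a set of functions registered in a suite and run by the kunit framework. The snippet below is a minimal, purely illustrative suite (the names are made up for this example and none of it is code from this patch):

  #include <kunit/test.h>

  /* Illustrative only: the smallest useful KUnit suite. */
  static void example_add_test(struct kunit *test)
  {
          KUNIT_EXPECT_EQ(test, 2 + 2, 4);
  }

  static struct kunit_case example_test_cases[] = {
          KUNIT_CASE(example_add_test),
          {}
  };

  static struct kunit_suite example_test_suite = {
          .name = "example",
          .test_cases = example_test_cases,
  };
  kunit_test_suite(example_test_suite);

The tests added by this patch follow the same pattern, but are instantiated per page-table format and parameterized over each format's configuration list, as described below.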
The kunit tests can be run in the kunit framework, using commands like: tools/testing/kunit/kunit.py run --build_dir build_kunit_arm64 --arch arm64 --make_options LLVM=-19 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_uml --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386 --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386pae --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig --kconfig_add CONFIG_X86_PAE=y There are several interesting corner cases on the 32 bit platforms that need checking. Like the generic tests, these are run on the format's configuration list using kunit "params". This also checks the core iommu parts of the page table code as it enters the logic through a mock iommu_domain. The following are checked: - PT_FEAT_DYNAMIC_TOP properly adds levels one by one - Every page size can be iommu_map()'d, and mapping creates that size - iommu_iova_to_phys() works with every page size - Test converting OA -> non present -> OA when the two OAs overlap and free table levels - Test that unmap stops at holes, unmap doesn't split, and unmap returns the right values for partial unmap requests - Randomly map/unmap. Checks map with random sizes, that map fails when hitting collisions doing nothing, unmap/map with random intersections and full unmap of random sizes. Also checks iommu_iova_to_phys() with random sizes - Check for memory leaks by monitoring NR_SECONDARY_PAGETABLE Reviewed-by: Kevin Tian Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/fmt/iommu_template.h | 1 + drivers/iommu/generic_pt/kunit_iommu.h | 2 + drivers/iommu/generic_pt/kunit_iommu_pt.h | 487 ++++++++++++++++++++++++++ include/linux/irqchip/riscv-imsic.h | 3 +- 4 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h index 11e85106ae30..d28e86abdf2e 100644 --- a/drivers/iommu/generic_pt/fmt/iommu_template.h +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -44,4 +44,5 @@ * which means we are building the kunit modle. 
*/ #include "../kunit_generic_pt.h" +#include "../kunit_iommu_pt.h" #endif diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h index 28ec313f151e..d541235632aa 100644 --- a/drivers/iommu/generic_pt/kunit_iommu.h +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -71,6 +71,8 @@ struct kunit_iommu_priv { unsigned int largest_pgsz_lg2; pt_oaddr_t test_oa; pt_vaddr_t safe_pgsize_bitmap; + unsigned long orig_nr_secondary_pagetable; + }; PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain); diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h new file mode 100644 index 000000000000..e8a63c8ea850 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "kunit_iommu.h" +#include "pt_iter.h" +#include +#include + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len); + +struct count_valids { + u64 per_size[PT_VADDR_MAX_LG2]; +}; + +static int __count_valids(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct count_valids *valids = arg; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + pt_descend(&pts, arg, __count_valids); + continue; + } + if (pts.type == PT_ENTRY_OA) { + valids->per_size[pt_entry_oa_lg2sz(&pts)]++; + continue; + } + } + return 0; +} + +/* + * Number of valid table entries. This counts contiguous entries as a single + * valid. + */ +static unsigned int count_valids(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) + total += valids.per_size[i]; + return total; +} + +/* Only a single page size is present, count the number of valid entries */ +static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) { + if ((1ULL << i) == pgsz) + total = valids.per_size[i]; + else + KUNIT_ASSERT_EQ(test, valids.per_size[i], 0); + } + return total; +} + +static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + size_t ret; + + ret = iommu_unmap(&priv->domain, va, len); + KUNIT_ASSERT_EQ(test, ret, len); +} + +static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2); + pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2); + + for (; pfn != end_pfn; pfn++) { + phys_addr_t res = iommu_iova_to_phys(&priv->domain, + pfn * priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa); + if (res != pa) + break; + pa += priv->smallest_pgsz; + } +} + +static void test_increase_level(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_common *common = priv->common; + 
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format"); + + if (IS_32BIT) + kunit_skip(test, "Unable to test on 32bit"); + + KUNIT_ASSERT_GT(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + + /* Add every possible level to the max */ + while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) { + struct pt_range top_range = pt_top_range(common); + + if (top_range.va == 0) + do_map(test, top_range.last_va + 1, 0, + priv->smallest_pgsz); + else + do_map(test, top_range.va - priv->smallest_pgsz, 0, + priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level, + top_range.top_level + 1); + KUNIT_ASSERT_GE(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + } +} + +static void test_map_simple(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + pt_vaddr_t cur_va; + + /* Map every reported page size */ + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + + cur_va = ALIGN(cur_va, len); + do_map(test, cur_va, paddr, len); + if (len <= SZ_2G) + check_iova(test, cur_va, paddr, len); + cur_va += len; + } + + /* The read interface reports that every page size was created */ + range = pt_top_range(priv->common); + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + if (pgsize_bitmap & (1ULL << pgsz_lg2)) + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1); + else + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0); + } + + /* Unmap works */ + range = pt_top_range(priv->common); + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + cur_va = ALIGN(cur_va, len); + do_unmap(test, cur_va, len); + cur_va += len; + } + KUNIT_ASSERT_EQ(test, count_valids(test), 0); +} + +/* + * Test to convert a table pointer into an OA by mapping something small, + * unmapping it so as to leave behind a table pointer, then mapping something + * larger that will convert the table into an OA. + */ +static void test_map_table_to_oa(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t limited_pgbitmap = + priv->info.pgsize_bitmap % (IS_32BIT ? 
SZ_2G : SZ_16G); + struct pt_range range = pt_top_range(priv->common); + unsigned int pgsz_lg2; + pt_vaddr_t max_pgsize; + pt_vaddr_t cur_va; + + max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1); + KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize); + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + pt_vaddr_t offset; + + if (!(priv->info.pgsize_bitmap & len)) + continue; + if (len > max_pgsize) + break; + + cur_va = ALIGN(range.va + priv->smallest_pgsz * 256, + max_pgsize); + for (offset = 0; offset != max_pgsize; offset += len) + do_map(test, cur_va + offset, paddr + offset, len); + check_iova(test, cur_va, paddr, max_pgsize); + KUNIT_ASSERT_EQ(test, count_valids_single(test, len), + log2_div(max_pgsize, pgsz_lg2)); + + if (len == max_pgsize) { + do_unmap(test, cur_va, max_pgsize); + } else { + do_unmap(test, cur_va, max_pgsize / 2); + for (offset = max_pgsize / 2; offset != max_pgsize; + offset += len) + do_unmap(test, cur_va + offset, len); + } + + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + } +} + +/* + * Test unmapping a small page at the start of a large page. This always unmaps + * the large page. + */ +static void test_unmap_split(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + unsigned int count = 0; + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_vaddr_t base_len = log2_to_int(pgsz_lg2); + unsigned int next_pgsz_lg2; + + if (!(pgsize_bitmap & base_len)) + continue; + + for (next_pgsz_lg2 = pgsz_lg2 + 1; + next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) { + pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2); + pt_vaddr_t vaddr = top_range.va; + pt_oaddr_t paddr = 0; + size_t gnmapped; + + if (!(pgsize_bitmap & next_len)) + continue; + + do_map(test, vaddr, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + /* Make sure unmap doesn't keep going */ + do_map(test, vaddr, paddr, next_len); + do_map(test, vaddr + next_len, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr + next_len, + next_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + count++; + } + } + + if (count == 0) + kunit_skip(test, "Test needs two page sizes"); +} + +static void unmap_collisions(struct kunit *test, struct maple_tree *mt, + pt_vaddr_t start, pt_vaddr_t last) +{ + struct kunit_iommu_priv *priv = test->priv; + MA_STATE(mas, mt, start, last); + void *entry; + + mtree_lock(mt); + mas_for_each(&mas, entry, last) { + pt_vaddr_t mas_start = mas.index; + pt_vaddr_t len = (mas.last - mas_start) + 1; + pt_oaddr_t paddr; + + mas_erase(&mas); + mas_pause(&mas); + mtree_unlock(mt); + + paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2); + check_iova(test, mas_start, paddr, len); + do_unmap(test, mas_start, len); + mtree_lock(mt); + } + mtree_unlock(mt); +} + +static void clamp_range(struct kunit *test, struct pt_range *range) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (range->last_va - range->va > SZ_1G) + range->last_va = range->va + SZ_1G; + KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX); + if (range->va <= MAPLE_RESERVED_RANGE) + range->va = + ALIGN(MAPLE_RESERVED_RANGE, 
priv->smallest_pgsz); +} + +/* + * Randomly map and unmap ranges that can large physical pages. If a random + * range overlaps with existing ranges then unmap them. This hits all the + * special cases. + */ +static void test_random_map(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range upper_range = pt_upper_range(priv->common); + struct pt_range top_range = pt_top_range(priv->common); + struct maple_tree mt; + unsigned int iter; + + mt_init(&mt); + + /* + * Shrink the range so randomization is more likely to have + * intersections + */ + clamp_range(test, &top_range); + clamp_range(test, &upper_range); + + for (iter = 0; iter != 1000; iter++) { + struct pt_range *range = &top_range; + pt_oaddr_t paddr; + pt_vaddr_t start; + pt_vaddr_t end; + int ret; + + if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) && + ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1)) + range = &upper_range; + + start = get_random_u32_below( + min(U32_MAX, range->last_va - range->va)); + end = get_random_u32_below( + min(U32_MAX, range->last_va - start)); + + start = ALIGN_DOWN(start, priv->smallest_pgsz); + end = ALIGN(end, priv->smallest_pgsz); + start += range->va; + end += start; + if (start < range->va || end > range->last_va + 1 || + start >= end) + continue; + + /* Try overmapping to test the failure handling */ + paddr = oalog2_mod(start, priv->common->max_oasz_lg2); + ret = iommu_map(&priv->domain, start, paddr, end - start, + IOMMU_READ | IOMMU_WRITE, GFP_KERNEL); + if (ret) { + KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE); + unmap_collisions(test, &mt, start, end - 1); + do_map(test, start, paddr, end - start); + } + + KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range", + mtree_insert_range(&mt, start, end - 1, + XA_ZERO_ENTRY, + GFP_KERNEL)); + + check_iova(test, start, paddr, end - start); + if (iter % 100) + cond_resched(); + } + + unmap_collisions(test, &mt, 0, PT_VADDR_MAX); + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + + mtree_destroy(&mt); +} + +/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */ +static void test_pgsize_boundary(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + + if (top_range.va != 0 || top_range.last_va < 0xfef9ffff || + priv->smallest_pgsz != SZ_4K) + kunit_skip(test, "Format does not have the required range"); + + do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1); +} + +/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */ +static void test_mixed(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + pt_oaddr_t oa = start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + do_map(test, start, oa, len); + /* 14 2M, 3 1G, 3 2M */ + KUNIT_ASSERT_EQ(test, count_valids(test), 20); + check_iova(test, start, oa, len); +} + +static struct kunit_case iommu_test_cases[] = { + KUNIT_CASE_FMT(test_increase_level), + KUNIT_CASE_FMT(test_map_simple), + KUNIT_CASE_FMT(test_map_table_to_oa), + KUNIT_CASE_FMT(test_unmap_split), + KUNIT_CASE_FMT(test_random_map), + KUNIT_CASE_FMT(test_pgsize_boundary), + KUNIT_CASE_FMT(test_mixed), + {}, +}; + +static int 
pt_kunit_iommu_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->orig_nr_secondary_pagetable = + global_node_page_state(NR_SECONDARY_PAGETABLE); + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_iommu_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + /* + * Look for memory leaks, assumes kunit is running isolated and nothing + * else is using secondary page tables. + */ + KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable, + global_node_page_state(NR_SECONDARY_PAGETABLE)); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(iommu_suite) = { + .name = __stringify(NS(iommu_test)), + .init = pt_kunit_iommu_init, + .exit = pt_kunit_iommu_exit, + .test_cases = iommu_test_cases, +}; +kunit_test_suites(&NS(iommu_suite)); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kunit for generic page table"); +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7494952c5518..7f3ff5c5ea53 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -10,7 +10,6 @@ #include #include #include -#include #define IMSIC_MMIO_PAGE_SHIFT 12 #define IMSIC_MMIO_PAGE_SZ BIT(IMSIC_MMIO_PAGE_SHIFT) @@ -86,7 +85,7 @@ static inline const struct imsic_global_config *imsic_get_global_config(void) #endif -#ifdef CONFIG_ACPI +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC) int imsic_platform_acpi_probe(struct fwnode_handle *fwnode); struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev); #else -- cgit v1.2.3 From aefd967dab6469f5b827b59e50016a760dcc1fbc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:31 -0300 Subject: iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT This is the first step to supporting an incoherent walker, start and stop the incoherence around the allocation and frees of the page table memory. The iommu_pages API maps this to dma_map/unmap_single(), or arch cache flushing calls. 
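As a rough illustration of the start/stop pattern described above (a simplified sketch, not the kernel's iommu_pages implementation; the example_* helpers and the direct use of the streaming DMA API are assumptions made for this example only), a non-coherent walker needs the freshly written table memory flushed before the hardware may read it, and the mapping ended again before the memory is freed:

    #include <linux/dma-mapping.h>
    #include <linux/gfp.h>

    /* Allocate one table page and make it visible to a non-snooping walker. */
    static void *example_alloc_table(struct device *iommu_dev, size_t size,
                                     dma_addr_t *dma)
    {
            void *table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                                   get_order(size));

            if (!table)
                    return NULL;

            /* "start incoherence": flush the CPU's writes out to memory */
            *dma = dma_map_single(iommu_dev, table, size, DMA_TO_DEVICE);
            if (dma_mapping_error(iommu_dev, *dma)) {
                    free_pages((unsigned long)table, get_order(size));
                    return NULL;
            }
            return table;
    }

    /* "stop incoherence" before the page goes back to the allocator. */
    static void example_free_table(struct device *iommu_dev, void *table,
                                   dma_addr_t dma, size_t size)
    {
            dma_unmap_single(iommu_dev, dma, size, DMA_TO_DEVICE);
            free_pages((unsigned long)table, get_order(size));
    }

Later CPU updates to an already-mapped table would additionally need something like dma_sync_single_for_device() before the walker can observe them, which is why the commit message describes this patch as only the first step; the real code routes all of this through the iommu_pages helpers named in the diff below.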
Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 89 ++++++++++++++++++++++++++-------- drivers/iommu/generic_pt/kunit_iommu.h | 1 + drivers/iommu/generic_pt/pt_defs.h | 5 +- include/linux/generic_pt/common.h | 6 +++ include/linux/generic_pt/iommu.h | 7 +++ 5 files changed, 88 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 142001f5aa83..2cad07da995a 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -24,6 +24,10 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, { struct pt_common *common = common_from_iommu(iommu_table); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(free_list, + iommu_table->iommu_device); + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); @@ -329,35 +333,55 @@ static int __collect_tables(struct pt_range *range, void *arg, return 0; } -static inline struct pt_table_p *table_alloc_top(struct pt_common *common, - uintptr_t top_of_table, - gfp_t gfp) +enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH}; + +/* Allocate a table, the empty table will be ready to be installed. */ +static inline struct pt_table_p *_table_alloc(struct pt_common *common, + size_t lg2sz, gfp_t gfp, + enum alloc_mode mode) { struct pt_iommu *iommu_table = iommu_from_common(common); + struct pt_table_p *table_mem; + + table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp, + log2_to_int(lg2sz)); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + mode == ALLOC_NORMAL) { + int ret = iommu_pages_start_incoherent( + table_mem, iommu_table->iommu_device); + if (ret) { + iommu_free_pages(table_mem); + return ERR_PTR(ret); + } + } + return table_mem; +} +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp, + enum alloc_mode mode) +{ /* * Top doesn't need the free list or otherwise, so it technically * doesn't need to use iommu pages. Use the API anyhow as the top is * usually not smaller than PAGE_SIZE to keep things simple. 
*/ - return iommu_alloc_pages_node_sz( - iommu_table->nid, gfp, - log2_to_int(pt_top_memsize_lg2(common, top_of_table))); + return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table), + gfp, mode); } /* Allocate an interior table */ static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, - gfp_t gfp) + gfp_t gfp, enum alloc_mode mode) { - struct pt_iommu *iommu_table = - iommu_from_common(parent_pts->range->common); struct pt_state child_pts = pt_init(parent_pts->range, parent_pts->level - 1, NULL); - return iommu_alloc_pages_node_sz( - iommu_table->nid, gfp, - log2_to_int(pt_num_items_lg2(&child_pts) + - ilog2(PT_ITEM_WORD_SIZE))); + return _table_alloc(parent_pts->range->common, + pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE), + gfp, mode); } static inline int pt_iommu_new_table(struct pt_state *pts, @@ -370,13 +394,15 @@ static inline int pt_iommu_new_table(struct pt_state *pts, if (PT_WARN_ON(!pt_can_have_table(pts))) return -ENXIO; - table_mem = table_alloc(pts, attrs->gfp); + table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); phys = virt_to_phys(table_mem); if (!pt_install_table(pts, phys, attrs)) { - iommu_free_pages(table_mem); + iommu_pages_free_incoherent( + table_mem, + iommu_from_common(pts->range->common)->iommu_device); return -EAGAIN; } @@ -389,7 +415,9 @@ static inline int pt_iommu_new_table(struct pt_state *pts, pt_load_single_entry(pts); if (PT_WARN_ON(pt_table_pa(pts) != phys)) { pt_clear_entries(pts, ilog2(1)); - iommu_free_pages(table_mem); + iommu_pages_free_incoherent( + table_mem, iommu_from_common(pts->range->common) + ->iommu_device); return -EINVAL; } } @@ -615,8 +643,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, } new_level = pts.level; - table_mem = table_alloc_top( - common, _pt_top_set(NULL, pts.level), map->attrs.gfp); + table_mem = + table_alloc_top(common, _pt_top_set(NULL, pts.level), + map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); iommu_pages_list_add(&free_list, table_mem); @@ -633,6 +662,16 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, new_top_of_table = _pt_top_set(pts.table, pts.level); } + /* + * Avoid double flushing, flush it once after all pt_install_table() + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = iommu_pages_start_incoherent_list( + &free_list, iommu_table->iommu_device); + if (ret) + goto err_free; + } + /* * top_of_table is write locked by the spinlock, but readers can use * READ_ONCE() to get the value. Since we encode both the level and the @@ -665,6 +704,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, return 0; err_free: + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&free_list, + iommu_table->iommu_device); iommu_put_pages_list(&free_list); return ret; } @@ -988,6 +1030,9 @@ static void NS(deinit)(struct pt_iommu *iommu_table) * The driver has to already have fenced the HW access to the page table * and invalidated any caching referring to this memory. 
*/ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&collect.free_list, + iommu_table->iommu_device); iommu_put_pages_list(&collect.free_list); } @@ -1078,6 +1123,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table) memset_after(fmt_table, 0, iommu.domain); /* The caller can initialize some of these values */ + iommu_table->iommu_device = cfg.iommu_device; iommu_table->driver_ops = cfg.driver_ops; iommu_table->nid = cfg.nid; } @@ -1123,11 +1169,16 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table, pt_feature(common, PT_FEAT_DYNAMIC_TOP))) return -EINVAL; + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + WARN_ON(!iommu_table->iommu_device)) + return -EINVAL; + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); if (ret) return ret; - table_mem = table_alloc_top(common, common->top_of_table, gfp); + table_mem = table_alloc_top(common, common->top_of_table, gfp, + ALLOC_NORMAL); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); pt_top_set(common, table_mem, pt_top_get_level(common)); diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h index d541235632aa..5d4f269627d5 100644 --- a/drivers/iommu/generic_pt/kunit_iommu.h +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -139,6 +139,7 @@ static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv) priv->fmt_table.iommu.nid = NUMA_NO_NODE; priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops; + priv->fmt_table.iommu.iommu_device = priv->dummy_dev; priv->domain.ops = &kunit_pt_ops; ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL); if (ret) { diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h index 819057de50d8..c25544d72f97 100644 --- a/drivers/iommu/generic_pt/pt_defs.h +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -48,13 +48,16 @@ enum { /* * When in debug mode we compile all formats with all features. This allows the * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or - * FULL_VA. + * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have */ #if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) enum { PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, PT_DEBUG_SUPPORTED_FEATURES = UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ? + 0 : + BIT(PT_FEAT_DMA_INCOHERENT))) & ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : BIT(PT_FEAT_SIGN_EXTEND)), diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 96f8a6a7d60e..883069e32952 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -85,6 +85,12 @@ enum { * position. */ enum pt_features { + /** + * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before + * assuming the HW can read it. Otherwise a SMP release is sufficient + * for HW to read it. + */ + PT_FEAT_DMA_INCOHERENT, /** * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to * PT_VADDR_MAX. diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index fde7ccf007c5..21132e342a79 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -57,6 +57,13 @@ struct pt_iommu { * table walkers. */ int nid; + + /** + * @iommu_device: Device pointer used for any DMA cache flushing when + * PT_FEAT_DMA_INCOHERENT. This is the iommu device that created the + * page table which must have dma ops that perform cache flushing. 
+ */ + struct device *iommu_device; }; /** -- cgit v1.2.3 From 5448c1558f60d4051c90938f2878c6fb20e2982a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:33 -0300 Subject: iommupt: Add the Intel VT-d second stage page table format The VT-d second stage format is almost the same as the x86 PAE format, except the bit encodings in the PTE are different and a few new PTE features, like force coherency are present. Among all the formats it is unique in not having a designated present bit. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 50,64 , 21.21 2^21, 59,70 , 56,67 , 16.16 2^30, 54,66 , 52,63 , 17.17 256*2^12, 384,524 , 337,516 , 34.34 256*2^21, 387,632 , 336,626 , 46.46 256*2^30, 376,629 , 323,623 , 48.48 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 67,86 , 63,84 , 25.25 2^21, 64,84 , 59,80 , 26.26 2^30, 59,78 , 56,74 , 24.24 256*2^12, 216,335 , 198,317 , 37.37 256*2^21, 245,350 , 232,344 , 32.32 256*2^30, 248,345 , 226,339 , 33.33 Cc: Tina Zhang Cc: Kevin Tian Cc: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/.kunitconfig | 1 + drivers/iommu/generic_pt/Kconfig | 11 ++ drivers/iommu/generic_pt/fmt/Makefile | 2 + drivers/iommu/generic_pt/fmt/defs_vtdss.h | 21 +++ drivers/iommu/generic_pt/fmt/iommu_vtdss.c | 10 + drivers/iommu/generic_pt/fmt/vtdss.h | 292 +++++++++++++++++++++++++++++ include/linux/generic_pt/common.h | 18 ++ include/linux/generic_pt/iommu.h | 11 ++ 8 files changed, 366 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/defs_vtdss.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_vtdss.c create mode 100644 drivers/iommu/generic_pt/fmt/vtdss.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index 2016c5e5ac0f..52ac9e661ffd 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index 6dcb771b3c58..79f65268f312 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1 Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. 
+ config IOMMU_PT_X86_64 tristate "IOMMU page table for x86 64-bit, 4/5 levels" depends on !GENERIC_ATOMIC64 # for cmpxchg64 @@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 5a3379107999..976b49ec97dc 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 IOMMU_PT_KUNIT_TEST := diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 000000000000..4a239bcaae2a --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 000000000000..f551711e2a33 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 000000000000..d9774848eb6f --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stange 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification". 
+ * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define 
pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + if (__builtin_constant_p(bitnr)) + BUILD_BUG(); + else + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. 
+ */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return -EINVAL; + + if (iommu_prot & IOMMU_READ) + pte |= VTDSS_FMT_R; + if (iommu_prot & IOMMU_WRITE) + pte |= VTDSS_FMT_W; + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) + pte |= VTDSS_FMT_SNP; + + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && + !(iommu_prot & IOMMU_WRITE)) { + pr_err_ratelimited( + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot vtdss_pt_iommu_set_prot + +static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, + const struct pt_iommu_vtdss_cfg *cfg) +{ + struct pt_vtdss *table = &iommu_table->vtdss_pt; + unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2; + + if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2) + return -EOPNOTSUPP; + else if (vasz_lg2 > 48) + pt_top_set_level(&table->common, 4); + else if (vasz_lg2 > 39) + pt_top_set_level(&table->common, 3); + else if (vasz_lg2 > 30) + pt_top_set_level(&table->common, 2); + else + return -EOPNOTSUPP; + return 0; +} +#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init + +static inline void +vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, + const struct pt_range *top_range, + struct pt_iommu_vtdss_hw_info *info) +{ + info->ssptptr = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); + /* + * top_level = 2 = 3 level table aw=1 + * top_level = 3 = 4 level table aw=2 + * top_level = 4 = 5 level table aw=3 + */ + info->aw = top_range->top_level - 1; +} +#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { + [0] = { .common.hw_max_vasz_lg2 = 39 }, + [1] = { .common.hw_max_vasz_lg2 = 48 }, + [2] = { .common.hw_max_vasz_lg2 = 57 }, +}; +#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 883069e32952..6a9a1acb5aad 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -157,6 +157,24 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_vtdss { + struct pt_common common; +}; + +enum { + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, + /* + * Prevent creating read-only PTEs. Used to work around HW errata + * ERRATA_772415_SPR17. 
+ */ + PT_FEAT_VTDSS_FORCE_WRITEABLE, +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 21132e342a79..cfe05a77f86b 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_vtdss_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_vtdss_hw_info { + u64 ssptptr; + u8 aw; +}; + +IOMMU_FORMAT(vtdss, vtdss_pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; }; -- cgit v1.2.3 From 0485a18d9141775d54489997b284fe2557b5898e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:32 +0100 Subject: fs: rename fs_types.h to fs_dirent.h We will split out a bunch of types into a separate header. So free up the appropriate name for it. Link: https://patch.msgid.link/20251104-work-fs-header-v1-1-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- fs/Makefile | 2 +- fs/fs_dirent.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fs_types.c | 105 ---------------------------------------------- include/linux/fs.h | 2 +- include/linux/fs_dirent.h | 78 ++++++++++++++++++++++++++++++++++ include/linux/fs_types.h | 75 --------------------------------- 6 files changed, 185 insertions(+), 182 deletions(-) create mode 100644 fs/fs_dirent.c delete mode 100644 fs/fs_types.c create mode 100644 include/linux/fs_dirent.h delete mode 100644 include/linux/fs_types.h (limited to 'include/linux') diff --git a/fs/Makefile b/fs/Makefile index e3523ab2e587..a04274a3c854 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ file_attr.o diff --git a/fs/fs_dirent.c b/fs/fs_dirent.c new file mode 100644 index 000000000000..e5e08f213816 --- /dev/null +++ b/fs/fs_dirent.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +/* + * fs on-disk file type to dirent file type conversion + */ +static const unsigned char fs_dtype_by_ftype[FT_MAX] = { + [FT_UNKNOWN] = DT_UNKNOWN, + [FT_REG_FILE] = DT_REG, + [FT_DIR] = DT_DIR, + [FT_CHRDEV] = DT_CHR, + [FT_BLKDEV] = DT_BLK, + [FT_FIFO] = DT_FIFO, + [FT_SOCK] = DT_SOCK, + [FT_SYMLINK] = DT_LNK +}; + +/** + * fs_ftype_to_dtype() - fs on-disk file type to dirent type. + * @filetype: The on-disk file type to convert. + * + * This function converts the on-disk file type value (FT_*) to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_ftype_to_dtype(unsigned int filetype) +{ + if (filetype >= FT_MAX) + return DT_UNKNOWN; + + return fs_dtype_by_ftype[filetype]; +} +EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); + +/* + * dirent file type to fs on-disk file type conversion + * Values not initialized explicitly are FT_UNKNOWN (0). 
+ */ +static const unsigned char fs_ftype_by_dtype[DT_MAX] = { + [DT_REG] = FT_REG_FILE, + [DT_DIR] = FT_DIR, + [DT_LNK] = FT_SYMLINK, + [DT_CHR] = FT_CHRDEV, + [DT_BLK] = FT_BLKDEV, + [DT_FIFO] = FT_FIFO, + [DT_SOCK] = FT_SOCK, +}; + +/** + * fs_umode_to_ftype() - file mode to on-disk file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the on-disk file type (FT_*). + * + * Context: Any context. + * Return: + * * FT_UNKNOWN - Unknown type + * * FT_REG_FILE - Regular file + * * FT_DIR - Directory + * * FT_CHRDEV - Character device + * * FT_BLKDEV - Block device + * * FT_FIFO - FIFO + * * FT_SOCK - Local-domain socket + * * FT_SYMLINK - Symbolic link + */ +unsigned char fs_umode_to_ftype(umode_t mode) +{ + return fs_ftype_by_dtype[S_DT(mode)]; +} +EXPORT_SYMBOL_GPL(fs_umode_to_ftype); + +/** + * fs_umode_to_dtype() - file mode to dirent file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_umode_to_dtype(umode_t mode) +{ + return fs_ftype_to_dtype(fs_umode_to_ftype(mode)); +} +EXPORT_SYMBOL_GPL(fs_umode_to_dtype); diff --git a/fs/fs_types.c b/fs/fs_types.c deleted file mode 100644 index 78365e5dc08c..000000000000 --- a/fs/fs_types.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -/* - * fs on-disk file type to dirent file type conversion - */ -static const unsigned char fs_dtype_by_ftype[FT_MAX] = { - [FT_UNKNOWN] = DT_UNKNOWN, - [FT_REG_FILE] = DT_REG, - [FT_DIR] = DT_DIR, - [FT_CHRDEV] = DT_CHR, - [FT_BLKDEV] = DT_BLK, - [FT_FIFO] = DT_FIFO, - [FT_SOCK] = DT_SOCK, - [FT_SYMLINK] = DT_LNK -}; - -/** - * fs_ftype_to_dtype() - fs on-disk file type to dirent type. - * @filetype: The on-disk file type to convert. - * - * This function converts the on-disk file type value (FT_*) to the directory - * entry type (DT_*). - * - * Context: Any context. - * Return: - * * DT_UNKNOWN - Unknown type - * * DT_FIFO - FIFO - * * DT_CHR - Character device - * * DT_DIR - Directory - * * DT_BLK - Block device - * * DT_REG - Regular file - * * DT_LNK - Symbolic link - * * DT_SOCK - Local-domain socket - */ -unsigned char fs_ftype_to_dtype(unsigned int filetype) -{ - if (filetype >= FT_MAX) - return DT_UNKNOWN; - - return fs_dtype_by_ftype[filetype]; -} -EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); - -/* - * dirent file type to fs on-disk file type conversion - * Values not initialized explicitly are FT_UNKNOWN (0). - */ -static const unsigned char fs_ftype_by_dtype[DT_MAX] = { - [DT_REG] = FT_REG_FILE, - [DT_DIR] = FT_DIR, - [DT_LNK] = FT_SYMLINK, - [DT_CHR] = FT_CHRDEV, - [DT_BLK] = FT_BLKDEV, - [DT_FIFO] = FT_FIFO, - [DT_SOCK] = FT_SOCK, -}; - -/** - * fs_umode_to_ftype() - file mode to on-disk file type. - * @mode: The file mode to convert. - * - * This function converts the file mode value to the on-disk file type (FT_*). - * - * Context: Any context. 
- * Return: - * * FT_UNKNOWN - Unknown type - * * FT_REG_FILE - Regular file - * * FT_DIR - Directory - * * FT_CHRDEV - Character device - * * FT_BLKDEV - Block device - * * FT_FIFO - FIFO - * * FT_SOCK - Local-domain socket - * * FT_SYMLINK - Symbolic link - */ -unsigned char fs_umode_to_ftype(umode_t mode) -{ - return fs_ftype_by_dtype[S_DT(mode)]; -} -EXPORT_SYMBOL_GPL(fs_umode_to_ftype); - -/** - * fs_umode_to_dtype() - file mode to dirent file type. - * @mode: The file mode to convert. - * - * This function converts the file mode value to the directory - * entry type (DT_*). - * - * Context: Any context. - * Return: - * * DT_UNKNOWN - Unknown type - * * DT_FIFO - FIFO - * * DT_CHR - Character device - * * DT_DIR - Directory - * * DT_BLK - Block device - * * DT_REG - Regular file - * * DT_LNK - Symbolic link - * * DT_SOCK - Local-domain socket - */ -unsigned char fs_umode_to_dtype(umode_t mode) -{ - return fs_ftype_to_dtype(fs_umode_to_ftype(mode)); -} -EXPORT_SYMBOL_GPL(fs_umode_to_dtype); diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3c971ddace41 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/fs_dirent.h b/include/linux/fs_dirent.h new file mode 100644 index 000000000000..92f75c5bac19 --- /dev/null +++ b/include/linux/fs_dirent.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_DIRENT_H +#define _LINUX_FS_DIRENT_H + +#include +#include + +/* + * This is a header for the common implementation of dirent + * to fs on-disk file type conversion. Although the fs on-disk + * bits are specific to every file system, in practice, many + * file systems use the exact same on-disk format to describe + * the lower 3 file type bits that represent the 7 POSIX file + * types. + * + * It is important to note that the definitions in this + * header MUST NOT change. This would break both the + * userspace ABI and the on-disk format of filesystems + * using this code. + * + * All those file systems can use this generic code for the + * conversions. + */ + +/* + * struct dirent file types + * exposed to user via getdents(2), readdir(3) + * + * These match bits 12..15 of stat.st_mode + * (ie "(i_mode >> 12) & 15"). + */ +#define S_DT_SHIFT 12 +#define S_DT(mode) (((mode) & S_IFMT) >> S_DT_SHIFT) +#define S_DT_MASK (S_IFMT >> S_DT_SHIFT) + +/* these are defined by POSIX and also present in glibc's dirent.h */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +#define DT_MAX (S_DT_MASK + 1) /* 16 */ + +/* + * fs on-disk file types. + * Only the low 3 bits are used for the POSIX file types. + * Other bits are reserved for fs private use. + * These definitions are shared and used by multiple filesystems, + * and MUST NOT change under any circumstances. + * + * Note that no fs currently stores the whiteout type on-disk, + * so whiteout dirents are exposed to user as DT_CHR. 
+ */ +#define FT_UNKNOWN 0 +#define FT_REG_FILE 1 +#define FT_DIR 2 +#define FT_CHRDEV 3 +#define FT_BLKDEV 4 +#define FT_FIFO 5 +#define FT_SOCK 6 +#define FT_SYMLINK 7 + +#define FT_MAX 8 + +/* + * declarations for helper functions, accompanying implementation + * is in fs/fs_dirent.c + */ +extern unsigned char fs_ftype_to_dtype(unsigned int filetype); +extern unsigned char fs_umode_to_ftype(umode_t mode); +extern unsigned char fs_umode_to_dtype(umode_t mode); + +#endif /* _LINUX_FS_DIRENT_H */ diff --git a/include/linux/fs_types.h b/include/linux/fs_types.h deleted file mode 100644 index 54816791196f..000000000000 --- a/include/linux/fs_types.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FS_TYPES_H -#define _LINUX_FS_TYPES_H - -/* - * This is a header for the common implementation of dirent - * to fs on-disk file type conversion. Although the fs on-disk - * bits are specific to every file system, in practice, many - * file systems use the exact same on-disk format to describe - * the lower 3 file type bits that represent the 7 POSIX file - * types. - * - * It is important to note that the definitions in this - * header MUST NOT change. This would break both the - * userspace ABI and the on-disk format of filesystems - * using this code. - * - * All those file systems can use this generic code for the - * conversions. - */ - -/* - * struct dirent file types - * exposed to user via getdents(2), readdir(3) - * - * These match bits 12..15 of stat.st_mode - * (ie "(i_mode >> 12) & 15"). - */ -#define S_DT_SHIFT 12 -#define S_DT(mode) (((mode) & S_IFMT) >> S_DT_SHIFT) -#define S_DT_MASK (S_IFMT >> S_DT_SHIFT) - -/* these are defined by POSIX and also present in glibc's dirent.h */ -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 -#define DT_WHT 14 - -#define DT_MAX (S_DT_MASK + 1) /* 16 */ - -/* - * fs on-disk file types. - * Only the low 3 bits are used for the POSIX file types. - * Other bits are reserved for fs private use. - * These definitions are shared and used by multiple filesystems, - * and MUST NOT change under any circumstances. - * - * Note that no fs currently stores the whiteout type on-disk, - * so whiteout dirents are exposed to user as DT_CHR. - */ -#define FT_UNKNOWN 0 -#define FT_REG_FILE 1 -#define FT_DIR 2 -#define FT_CHRDEV 3 -#define FT_BLKDEV 4 -#define FT_FIFO 5 -#define FT_SOCK 6 -#define FT_SYMLINK 7 - -#define FT_MAX 8 - -/* - * declarations for helper functions, accompanying implementation - * is in fs/fs_types.c - */ -extern unsigned char fs_ftype_to_dtype(unsigned int filetype); -extern unsigned char fs_umode_to_ftype(umode_t mode); -extern unsigned char fs_umode_to_dtype(umode_t mode); - -#endif -- cgit v1.2.3 From b2f35ac4146d32d4424aaa941bbc681f12c1b9e6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:04 -0700 Subject: iomap: add caller-provided callbacks for read and readahead Add caller-provided callbacks for read and readahead so that it can be used generically, especially by filesystems that are not block-based. 
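To make the "non-block-based" use case concrete, a caller that fetches data by some means other than a bio could hook the new interface up roughly as follows (a hypothetical sketch: the myfs_* names and myfs_iomap_ops are invented for illustration; only the ctx/ops layout quoted further below and the iomap_read_folio()/iomap_finish_folio_read() calls come from this patch):

    static int myfs_read_folio_range(const struct iomap_iter *iter,
                                     struct iomap_read_folio_ctx *ctx,
                                     size_t len)
    {
            struct folio *folio = ctx->cur_folio;
            size_t poff = offset_in_folio(folio, iter->pos);
            int err;

            /* e.g. send a request over the wire instead of submitting a bio */
            err = myfs_fetch_range(iter->inode, folio, poff, len);

            /* the caller must complete the range even when the read failed */
            iomap_finish_folio_read(folio, poff, len, err);
            return err;
    }

    static const struct iomap_read_ops myfs_read_ops = {
            .read_folio_range = myfs_read_folio_range,
            /* .submit_read is optional and omitted for a synchronous caller */
    };

    static int myfs_read_folio(struct file *file, struct folio *folio)
    {
            struct iomap_read_folio_ctx ctx = {
                    .ops = &myfs_read_ops,
                    .cur_folio = folio,
            };

            return iomap_read_folio(&myfs_iomap_ops, &ctx);
    }
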
In particular, this: * Modifies the read and readahead interface to take in a struct iomap_read_folio_ctx that is publicly defined as: struct iomap_read_folio_ctx { const struct iomap_read_ops *ops; struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; }; where struct iomap_read_ops is defined as: struct iomap_read_ops { int (*read_folio_range)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len); void (*read_submit)(struct iomap_read_folio_ctx *ctx); }; read_folio_range() reads in the folio range and is required by the caller to provide. read_submit() is optional and is used for submitting any pending read requests. * Modifies existing filesystems that use iomap for read and readahead to use the new API, through the new statically inlined helpers iomap_bio_read_folio() and iomap_bio_readahead(). There is no change in functionality for those filesystems. Signed-off-by: Joanne Koong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 44 ++++++++++++++++++ block/fops.c | 5 +- fs/erofs/data.c | 5 +- fs/gfs2/aops.c | 6 +-- fs/iomap/buffered-io.c | 55 +++++++++++----------- fs/xfs/xfs_aops.c | 5 +- fs/zonefs/file.c | 5 +- include/linux/iomap.h | 63 +++++++++++++++++++++++++- 8 files changed, 149 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 387fd9cc72ca..c88205132039 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -135,6 +135,28 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. +``struct iomap_read_ops`` +-------------------------- + +.. code-block:: c + + struct iomap_read_ops { + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + void (*submit_read)(struct iomap_read_folio_ctx *ctx); + }; + +iomap calls these functions: + + - ``read_folio_range``: Called to read in the range. This must be provided + by the caller. The caller is responsible for calling + iomap_finish_folio_read() after reading in the folio range. This should be + done even if an error is encountered during the read. This returns 0 on + success or a negative error on failure. + + - ``submit_read``: Submit any pending read requests. This function is + optional. + Internal per-Folio State ------------------------ @@ -182,6 +204,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero. The pagecache takes whatever locks it needs before calling the filesystem. +Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct +iomap_read_folio_ctx``: + +.. code-block:: c + + struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; + }; + +``iomap_readahead`` must set: + * ``ops->read_folio_range()`` and ``rac`` + +``iomap_read_folio`` must set: + * ``ops->read_folio_range()`` and ``cur_folio`` + +``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to +pass in any custom data the caller needs accessible in the ops callbacks for +fulfilling reads. 
+ Buffered Writes --------------- diff --git a/block/fops.c b/block/fops.c index 5e3db9fead77..4dad9c2d5796 100644 --- a/block/fops.c +++ b/block/fops.c @@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = { #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &blkdev_iomap_ops); + iomap_bio_read_folio(folio, &blkdev_iomap_ops); + return 0; } static void blkdev_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &blkdev_iomap_ops); + iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8ca29962a3dd..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio) { trace_erofs_read_folio(folio, true); - return iomap_read_folio(folio, &erofs_iomap_ops); + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) @@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac) trace_erofs_readahead(rac->mapping->host, readahead_index(rac), readahead_count(rac), true); - return iomap_readahead(rac, &erofs_iomap_ops); + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..38d4f343187a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -424,11 +424,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int error = 0; if (!gfs2_is_jdata(ip) || (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { - error = iomap_read_folio(folio, &gfs2_iomap_ops); + iomap_bio_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_read_folio(ip, folio); } else { @@ -503,7 +503,7 @@ static void gfs2_readahead(struct readahead_control *rac) else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); else - iomap_readahead(rac, &gfs2_iomap_ops); + iomap_bio_readahead(rac, &gfs2_iomap_ops); } /** diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 12b23ff97000..d7100a5f953a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -328,8 +328,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, } #ifdef CONFIG_BLOCK -static void iomap_finish_folio_read(struct folio *folio, size_t off, - size_t len, int error) +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; @@ -349,6 +349,7 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (finished) folio_end_read(folio, uptodate); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_read); static void iomap_read_end_io(struct bio *bio) { @@ -360,12 +361,6 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } -struct iomap_read_folio_ctx { - struct folio *cur_folio; - void *read_ctx; - struct readahead_control *rac; -}; - static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; @@ -374,7 +369,7 @@ static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) submit_bio(bio); } -static void iomap_bio_read_folio_range(const struct 
iomap_iter *iter, +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { struct folio *folio = ctx->cur_folio; @@ -412,8 +407,15 @@ static void iomap_bio_read_folio_range(const struct iomap_iter *iter, bio_add_folio_nofail(bio, folio, plen, poff); ctx->read_ctx = bio; } + return 0; } +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + static void iomap_read_init(struct folio *folio) { struct iomap_folio_state *ifs = folio->private; @@ -544,7 +546,9 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, if (!*bytes_pending) iomap_read_init(folio); *bytes_pending += plen; - iomap_bio_read_folio_range(iter, ctx, plen); + ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret) + return ret; } ret = iomap_iter_advance(iter, plen); @@ -556,26 +560,25 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, return 0; } -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct folio *folio = ctx->cur_folio; struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; - struct iomap_read_folio_ctx ctx = { - .cur_folio = folio, - }; size_t bytes_pending = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx, - &bytes_pending); + iter.status = iomap_read_folio_iter(&iter, ctx, &bytes_pending); - iomap_bio_submit_read(&ctx); + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); iomap_read_end(folio, bytes_pending); @@ -615,8 +618,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, /** * iomap_readahead - Attempt to read pages from a file. - * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. + * @ctx: The ctx used for issuing readahead. * * This function is for filesystems to call to implement their readahead * address_space operation. @@ -628,28 +631,28 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. 
*/ -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct readahead_control *rac = ctx->rac; struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - struct iomap_read_folio_ctx ctx = { - .rac = rac, - }; size_t cur_bytes_pending; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx, + iter.status = iomap_readahead_iter(&iter, ctx, &cur_bytes_pending); - iomap_bio_submit_read(&ctx); + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - if (ctx.cur_folio) - iomap_read_end(ctx.cur_folio, cur_bytes_pending); + if (ctx->cur_folio) + iomap_read_end(ctx->cur_folio, cur_bytes_pending); } EXPORT_SYMBOL_GPL(iomap_readahead); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..0c2ed00733f2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 90e2ad8ee5f4..c1e5e30e90a0 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = { static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_read_iomap_ops); + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); + return 0; } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_read_iomap_ops); + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); } /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4469b2318b08..37435b912755 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -16,6 +16,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -337,8 +338,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -465,6 +468,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len); void 
iomap_finish_folio_write(struct inode *inode, struct folio *folio, @@ -473,6 +478,34 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * The caller is responsible for calling iomap_finish_folio_read() after + * reading in the folio range. This should be done even if an error is + * encountered during the read. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -538,4 +571,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From d4e88bb08e5f7e6eb4e9c3685894b9b57bfdfb08 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:06 -0700 Subject: iomap: make iomap_read_folio() a void return No errors are propagated in iomap_read_folio(). Change iomap_read_folio() to a void return to make this clearer to callers. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 9 +-------- include/linux/iomap.h | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0d88a4f3c791..1dbcac17fefd 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -495,7 +495,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, return 0; } -int iomap_read_folio(const struct iomap_ops *ops, +void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx) { struct folio *folio = ctx->cur_folio; @@ -516,13 +516,6 @@ int iomap_read_folio(const struct iomap_ops *ops, ctx->ops->submit_read(ctx); iomap_read_end(folio, bytes_pending); - - /* - * Just like mpage_readahead and block_read_full_folio, we always - * return 0 and just set the folio error flag on errors. This - * should be cleaned up throughout the stack eventually. 
- */ - return 0; } EXPORT_SYMBOL_GPL(iomap_read_folio); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 37435b912755..6d864b446b6e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -338,7 +338,7 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(const struct iomap_ops *ops, +void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); void iomap_readahead(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); -- cgit v1.2.3 From f8d98072feee32722086ddae4f288b6c45ae4330 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:35 -0400 Subject: filemap: add helper to look up dirty folios in a range Add a new filemap_get_folios_dirty() helper to look up existing dirty folios in a range and add them to a folio_batch. This is to support optimization of certain iomap operations that only care about dirty folios in a target range. For example, zero range only zeroes the subset of dirty pages over unwritten mappings, seek hole/data may use similar logic in the future, etc. Note that the helper is intended for use under internal fs locks. Therefore it trylocks folios in order to filter out clean folios. This loosely follows the logic from filemap_range_has_writeback(). Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..7274a86b4871 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); +unsigned filemap_get_folios_dirty(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..da1be27de10d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2366,6 +2366,64 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); +/** + * filemap_get_folios_dirty - Get a batch of dirty folios + * @mapping: The address_space to search + * @start: The starting folio index + * @end: The final folio index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except + * the returned folios are presumed to be dirty or undergoing writeback. Dirty + * state is presumed because we don't block on folio lock nor want to miss + * folios. Callers that need to can recheck state upon locking the folio. + * + * This may not return all dirty folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. 
+ */ +unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + if (xa_is_value(folio)) + continue; + if (folio_trylock(folio)) { + bool clean = !folio_test_dirty(folio) && + !folio_test_writeback(folio); + folio_unlock(folio); + if (clean) { + folio_put(folio); + continue; + } + } + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no folio beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a folio at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: -- cgit v1.2.3 From 395ed1ef0012e1bb1e4050e84ba0173b3623112a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:37 -0400 Subject: iomap: optional zero range dirty folio processing The only way zero range can currently process unwritten mappings with dirty pagecache is to check whether the range is dirty before mapping lookup and then flush when at least one underlying mapping is unwritten. This ordering is required to prevent iomap lookup from racing with folio writeback and reclaim. Since zero range can skip ranges of unwritten mappings that are clean in cache, this operation can be improved by allowing the filesystem to provide a set of dirty folios that require zeroing. In turn, rather than flush or iterate file offsets, zero range can iterate on folios in the batch and advance over clean or uncached ranges in between. Add a folio_batch in struct iomap and provide a helper for filesystems to populate the batch at lookup time. Update the folio lookup path to return the next folio in the batch, if provided, and advance the iter if the folio starts beyond the current offset. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/iomap/iter.c | 6 ++++ include/linux/iomap.h | 4 +++ 3 files changed, 95 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index b5e85cd24360..1cabd9b0249e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -772,6 +772,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (iter->fbatch) { + struct folio *folio = folio_batch_next(iter->fbatch); + + if (!folio) + return NULL; + + /* + * The folio mapping generally shouldn't have changed based on + * fs locks, but be consistent with filemap lookup and retry + * the iter if it does. 
+ */ + folio_lock(folio); + if (unlikely(folio->mapping != iter->inode->i_mapping)) { + iter->iomap.flags |= IOMAP_F_STALE; + folio_unlock(folio); + return NULL; + } + + folio_get(folio); + return folio; + } + if (write_ops && write_ops->get_folio) return write_ops->get_folio(iter, pos, len); return iomap_get_folio(iter, pos, len); @@ -832,6 +854,8 @@ static int iomap_write_begin(struct iomap_iter *iter, int status = 0; len = min_not_zero(len, *plen); + *foliop = NULL; + *plen = 0; if (fatal_signal_pending(current)) return -EINTR; @@ -840,6 +864,15 @@ static int iomap_write_begin(struct iomap_iter *iter, if (IS_ERR(folio)) return PTR_ERR(folio); + /* + * No folio means we're done with a batch. We still have range to + * process so return and let the caller iterate and refill the batch. + */ + if (!folio) { + WARN_ON_ONCE(!iter->fbatch); + return 0; + } + /* * Now we have a locked folio, before we do anything with it we need to * check that the iomap we have cached is not stale. The inode extent @@ -860,6 +893,22 @@ static int iomap_write_begin(struct iomap_iter *iter, } } + /* + * The folios in a batch may not be contiguous. If we've skipped + * forward, advance the iter to the pos of the current folio. If the + * folio starts beyond the end of the mapping, it may have been trimmed + * since the lookup for whatever reason. Return a NULL folio to + * terminate the op. + */ + if (folio_pos(folio) > iter->pos) { + len = min_t(u64, folio_pos(folio) - iter->pos, + iomap_length(iter)); + status = iomap_iter_advance(iter, len); + len = iomap_length(iter); + if (status || !len) + goto out_unlock; + } + pos = iomap_trim_folio_range(iter, folio, poffset, &len); if (srcmap->type == IOMAP_INLINE) @@ -1406,6 +1455,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (iter->iomap.flags & IOMAP_F_STALE) break; + /* a NULL folio means we're done with a folio batch */ + if (!folio) { + status = iomap_iter_advance_full(iter); + break; + } + /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); @@ -1430,6 +1485,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, return status; } +loff_t +iomap_fill_dirty_folios( + struct iomap_iter *iter, + loff_t offset, + loff_t length) +{ + struct address_space *mapping = iter->inode->i_mapping; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; + + iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); + if (!iter->fbatch) + return offset + length; + folio_batch_init(iter->fbatch); + + filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); + return (start << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); + int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, @@ -1459,7 +1534,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, * flushing on partial eof zeroing, special case it to zero the * unaligned start portion if already dirty in pagecache. */ - if (off && + if (!iter.fbatch && off && filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) @@ -1476,13 +1551,18 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, * if dirty and the fs returns a mapping that might convert on * writeback. 
*/ - range_dirty = filemap_range_needs_writeback(inode->i_mapping, - iter.pos, iter.pos + iter.len - 1); + range_dirty = filemap_range_needs_writeback(mapping, iter.pos, + iter.pos + iter.len - 1); while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (srcmap->type == IOMAP_HOLE || - srcmap->type == IOMAP_UNWRITTEN) { + if (WARN_ON_ONCE(iter.fbatch && + srcmap->type != IOMAP_UNWRITTEN)) + return -EIO; + + if (!iter.fbatch && + (srcmap->type == IOMAP_HOLE || + srcmap->type == IOMAP_UNWRITTEN)) { s64 status; if (range_dirty) { diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 91d2024e00da..8692e5e41c6d 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -8,6 +8,12 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { + if (iter->fbatch) { + folio_batch_release(iter->fbatch); + kfree(iter->fbatch); + iter->fbatch = NULL; + } + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6d864b446b6e..65d123114883 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -242,6 +243,7 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + struct folio_batch *fbatch; void *private; }; @@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); +loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, + loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -- cgit v1.2.3 From 001397f5ef4908ea46a63059439e8c3bf3552d9f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 31 Oct 2025 14:10:26 +0100 Subject: iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag Btrfs requires all of its bios to be fs block aligned, normally it's totally fine but with the incoming block size larger than page size (bs > ps) support, the requirement is no longer met for direct IOs. Because iomap_dio_bio_iter() calls bio_iov_iter_get_pages(), only requiring alignment to be bdev_logical_block_size(). In the real world that value is either 512 or 4K, on 4K page sized systems it means bio_iov_iter_get_pages() can break the bio at any page boundary, breaking btrfs' requirement for bs > ps cases. To address this problem, introduce a new public iomap dio flag, IOMAP_DIO_FSBLOCK_ALIGNED. When calling __iomap_dio_rw() with that new flag, iomap_dio::flags will inherit that new flag, and iomap_dio_bio_iter() will take fs block size into the calculation of the alignment, and pass the alignment to bio_iov_iter_get_pages(), respecting the fs block size requirement. The initial user of this flag will be btrfs, which needs to calculate the checksum for direct read and thus requires the biovec to be fs block aligned for the incoming bs > ps support. Signed-off-by: Qu Wenruo Reviewed-by: Pankaj Raghav [hch: also align pos/len, incorporate the trace flags from Darrick] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 17 +++++++++++++++-- fs/iomap/trace.h | 7 ++++--- include/linux/iomap.h | 8 ++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e9e5f0703160..8b2f9fb89eb3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -336,8 +336,18 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) int nr_pages, ret = 0; u64 copied = 0; size_t orig_count; + unsigned int alignment; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) + /* + * File systems that write out of place and always allocate new blocks + * need each bio to be block aligned as that's the unit of allocation. + */ + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) + alignment = fs_block_size; + else + alignment = bdev_logical_block_size(iomap->bdev); + + if ((pos | length) & (alignment - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { @@ -434,7 +444,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - bdev_logical_block_size(iomap->bdev) - 1); + alignment - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall @@ -639,6 +649,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..532787277b16 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_DIO_STRINGS \ - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 65d123114883..8b1ac08c7474 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -553,6 +553,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. + */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3 From a50f7456f853ec3a6f07cbe1d16ad8a8b2501320 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 30 Oct 2025 14:05:27 +0000 Subject: dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope Clang doesn't like that (1ULL<<(64)) overflows when initializing a global scope variable, even if that part of the ternary isn't used when n = 64. The same initialization can be done without warnings in function scopes, and GCC doesn't mind either way. 
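For reference, the kind of file-scope initializer that triggered the warning looks roughly like the sketch below; the array and its name are purely illustrative and not taken from any in-tree driver.

#include <linux/dma-mapping.h>

/* Illustrative only: a file-scope table initialized with DMA_BIT_MASK(). */
static const u64 my_driver_dma_masks[] = {
	DMA_BIT_MASK(32),	/* fine with either definition */
	DMA_BIT_MASK(64),	/* previously tripped Clang's shift-overflow check */
};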
The build failure that highlighted this was already fixed in a different way [1], which also has detailed links to the Clang issues. However it's not going to be long before the same thing happens again, so it's better to fix the root cause. Fix it by using GENMASK_ULL() which does exactly the same thing, is much more readable anyway, and doesn't have a shift that overflows. [1]: https://lore.kernel.org/all/20250918-mmp-pdma-simplify-dma-addressing-v1-1-5c2be2b85696@riscstar.com/ Signed-off-by: James Clark Reviewed-by: Nathan Chancellor Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251030-james-fix-dma_bit_mask-v1-1-ad1ce7cfab6e@linaro.org --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 8248ff9363ee..2ceda49c609f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -90,7 +90,7 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0) struct dma_iova_state { dma_addr_t addr; -- cgit v1.2.3 From cf76553aaa363620f58a6b6409bf544f4bcfa8de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Nov 2025 11:00:14 +0100 Subject: entry,unwind/deferred: Fix unwind_reset_info() placement Stephen reported that on KASAN builds he's seeing: vmlinux.o: warning: objtool: user_exc_vmm_communication+0x15a: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_debug_user+0x182: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_int3+0x123: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: noist_exc_machine_check+0x17a: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: fred_exc_machine_check+0x17e: call to __kasan_check_read() leaves .noinstr.text section This turns out to be atomic ops from unwind_reset_info() that have explicit instrumentation. Place unwind_reset_info() in the preceding instrumentation_begin() section. 
Fixes: c6439bfaabf2 ("Merge tag 'trace-deferred-unwind-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace") Reported-by: Stephen Rothwell Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251105100014.GY4068168@noisy.programming.kicks-ass.net --- include/linux/irq-entry-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..ba1ed42f8a1c 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -253,11 +253,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) static __always_inline void exit_to_user_mode(void) { instrumentation_begin(); + unwind_reset_info(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - unwind_reset_info(); user_enter_irqoff(); arch_exit_to_user_mode(); lockdep_hardirqs_on(CALLER_ADDR0); -- cgit v1.2.3 From 8637fa89e678422995301ddb20b74190dffcccee Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:10 +0800 Subject: block: add __must_check attribute to sb_min_blocksize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sb_min_blocksize() returns 0 and the return value is not checked, it may lead to a situation where sb->s_blocksize is 0 when accessing the filesystem super block. After commit a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()"), this becomes more likely to happen when the block device’s logical_block_size is larger than PAGE_SIZE and the filesystem is unformatted. Add the __must_check attribute to ensure callers always check the return value. 
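A minimal sketch of the calling pattern the annotation enforces is shown below, assuming a hypothetical myfs_check_blocksize() helper and a 1024-byte request; only sb_min_blocksize() itself comes from this patch.

#include <linux/fs.h>

/* Hypothetical mount-time helper showing the checked calling pattern. */
static int myfs_check_blocksize(struct super_block *sb)
{
	int blocksize = sb_min_blocksize(sb, 1024);

	if (!blocksize)		/* device cannot support the requested size */
		return -EINVAL;

	/* blocksize is now at least 1024 and at least the logical block size */
	return 0;
}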
Cc: stable@vger.kernel.org # v6.15 Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-6-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- block/bdev.c | 2 +- include/linux/fs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..638f0cd458ae 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -231,7 +231,7 @@ int sb_set_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_set_blocksize); -int sb_min_blocksize(struct super_block *sb, int size) +int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3ea98c6cce81 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3423,8 +3423,8 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); -- cgit v1.2.3 From ae83f3b72621bd3187eb7956c7c2993a97d4b187 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 9 Oct 2025 20:06:09 -0700 Subject: module: Add compile-time check for embedded NUL characters Long ago, the kernel module license checks were bypassed by embedding a NUL character in the MODULE_LICENSE() string[1]. By using a string like "GPL\0proprietary text", the kernel would only read "GPL" due to C string termination at the NUL byte, allowing proprietary modules to avoid kernel tainting and access GPL-only symbols. The MODULE_INFO() macro stores these strings in the .modinfo ELF section, and get_next_modinfo() uses strcmp()-family functions which stop at the first NUL. This split the embedded string into two separate .modinfo entries, with only the first part being processed by license_is_gpl_compatible(). Add a compile-time check using static_assert that compares the full string length (sizeof - 1) against __builtin_strlen(), which stops at the first NUL. If they differ, compilation fails with a clear error message. While this check can still be circumvented by modifying the ELF binary post-compilation, it prevents accidental embedded NULs and forces intentional abuse to require deliberate binary manipulation rather than simple source-level tricks. Build tested with test modules containing both valid and invalid license strings. 
The check correctly rejects: MODULE_LICENSE("GPL\0proprietary") while accepting normal declarations: MODULE_LICENSE("GPL") Link: https://lwn.net/Articles/82305/ [1] Suggested-by: Rusty Russell Signed-off-by: Kees Cook Reviewed-by: Daniel Gomez Reviewed-by: Aaron Tomlin Reviewed-by: Petr Pavlu Tested-by: Daniel Gomez Signed-off-by: Daniel Gomez --- include/linux/moduleparam.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 6907aedc4f74..915f32f7d888 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -26,6 +26,9 @@ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) \ + static_assert( \ + sizeof(info) - 1 == __builtin_strlen(info), \ + "MODULE_INFO(" #tag ", ...) contains embedded NUL byte"); \ static const char __UNIQUE_ID(modinfo)[] \ __used __section(".modinfo") __aligned(1) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info -- cgit v1.2.3 From 3c36965df80801344850388592e95033eceea05b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Mon, 27 Oct 2025 12:05:22 +0100 Subject: regulator: Add support for MediaTek MT6363 SPMI PMIC Regulators Add a driver for the regulators found on the MT6363 PMIC, fully controlled by SPMI interface. This PMIC regulates voltage with an input range of 2.6-5.0V, and features 10 buck converters and 26 LDOs. Signed-off-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20251027110527.21002-5-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 10 + drivers/regulator/Makefile | 1 + drivers/regulator/mt6363-regulator.c | 938 +++++++++++++++++++++++++++++ include/linux/regulator/mt6363-regulator.h | 330 ++++++++++ 4 files changed, 1279 insertions(+) create mode 100644 drivers/regulator/mt6363-regulator.c create mode 100644 include/linux/regulator/mt6363-regulator.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 961b1a946346..99f2fdc62eee 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -954,6 +954,16 @@ config REGULATOR_MT6360 2-channel buck with Thermal Shutdown and Overload Protection 6-channel High PSRR and Low Dropout LDO. +config REGULATOR_MT6363 + tristate "MT6363 SPMI PMIC regulator driver" + depends on SPMI + select REGMAP_SPMI + help + Say Y here to enable support for regulators found in the MediaTek + MT6363 SPMI PMIC. + This driver supports the control of different power rails of device + through regulator interface. 
+ config REGULATOR_MT6370 tristate "MT6370 SubPMIC Regulator" depends on MFD_MT6370 diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index d5c3da9ecdb4..8f2750d3df33 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_REGULATOR_MT6357) += mt6357-regulator.o obj-$(CONFIG_REGULATOR_MT6358) += mt6358-regulator.o obj-$(CONFIG_REGULATOR_MT6359) += mt6359-regulator.o obj-$(CONFIG_REGULATOR_MT6360) += mt6360-regulator.o +obj-$(CONFIG_REGULATOR_MT6363) += mt6363-regulator.o obj-$(CONFIG_REGULATOR_MT6370) += mt6370-regulator.o obj-$(CONFIG_REGULATOR_MT6380) += mt6380-regulator.o obj-$(CONFIG_REGULATOR_MT6397) += mt6397-regulator.o diff --git a/drivers/regulator/mt6363-regulator.c b/drivers/regulator/mt6363-regulator.c new file mode 100644 index 000000000000..94ac955afa45 --- /dev/null +++ b/drivers/regulator/mt6363-regulator.c @@ -0,0 +1,938 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (c) 2024 MediaTek Inc. +// Copyright (c) 2025 Collabora Ltd +// AngeloGioacchino Del Regno + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MT6363_REGULATOR_MODE_NORMAL 0 +#define MT6363_REGULATOR_MODE_FCCM 1 +#define MT6363_REGULATOR_MODE_LP 2 +#define MT6363_REGULATOR_MODE_ULP 3 + +#define EN_SET_OFFSET 0x1 +#define EN_CLR_OFFSET 0x2 +#define OP_CFG_OFFSET 0x5 + +#define NORMAL_OP_CFG 0x10 +#define NORMAL_OP_EN 0x800000 + +#define OC_IRQ_ENABLE_DELAY_MS 10 + +/* Unlock keys for TMA and BUCK_TOP */ +#define MT6363_TMA_UNLOCK_VALUE 0x9c9c +#define MT6363_BUCK_TOP_UNLOCK_VALUE 0x5543 + +enum { + MT6363_ID_VBUCK1, + MT6363_ID_VBUCK2, + MT6363_ID_VBUCK3, + MT6363_ID_VBUCK4, + MT6363_ID_VBUCK5, + MT6363_ID_VBUCK6, + MT6363_ID_VBUCK7, + MT6363_ID_VS1, + MT6363_ID_VS2, + MT6363_ID_VS3, + MT6363_ID_VA12_1, + MT6363_ID_VA12_2, + MT6363_ID_VA15, + MT6363_ID_VAUX18, + MT6363_ID_VCN13, + MT6363_ID_VCN15, + MT6363_ID_VEMC, + MT6363_ID_VIO075, + MT6363_ID_VIO18, + MT6363_ID_VM18, + MT6363_ID_VSRAM_APU, + MT6363_ID_VSRAM_CPUB, + MT6363_ID_VSRAM_CPUM, + MT6363_ID_VSRAM_CPUL, + MT6363_ID_VSRAM_DIGRF, + MT6363_ID_VSRAM_MDFE, + MT6363_ID_VSRAM_MODEM, + MT6363_ID_VRF09, + MT6363_ID_VRF12, + MT6363_ID_VRF13, + MT6363_ID_VRF18, + MT6363_ID_VRFIO18, + MT6363_ID_VTREF18, + MT6363_ID_VUFS12, + MT6363_ID_VUFS18, +}; + +/** + * struct mt6363_regulator_info - MT6363 regulators information + * @desc: Regulator description structure + * @lp_mode_reg: Low Power mode register (normal/idle) + * @lp_mode_mask: Low Power mode regulator mask + * @hw_lp_mode_reg: Hardware voted Low Power mode register (normal/idle) + * @hw_lp_mode_mask: Hardware voted Low Power mode regulator mask + * @modeset_reg: AUTO/PWM mode register + * @modeset_mask: AUTO/PWM regulator mask + * @lp_imax_uA: Maximum load current (microamps), for Low Power mode only + * @op_en_reg: Operation mode enablement register + * @orig_op_en: Backup of a regulator's operation mode enablement register + * @orig_op_cfg: Backup of a regulator's operation mode configuration register + * @oc_work: Delayed work for enabling overcurrent IRQ + * @hwirq: PMIC-Internal HW Interrupt for overcurrent event + * @virq: Mapped Interrupt for overcurrent event + */ +struct mt6363_regulator_info { + struct regulator_desc desc; + u16 lp_mode_reg; + u16 lp_mode_mask; + u16 hw_lp_mode_reg; + u16 hw_lp_mode_mask; + u16 modeset_reg; + u16 modeset_mask; + int lp_imax_uA; + u16 
op_en_reg; + u32 orig_op_en; + u8 orig_op_cfg; + struct delayed_work oc_work; + u8 hwirq; + int virq; +}; + +#define MT6363_BUCK(match, vreg, min, max, step, en_reg, lp_reg, \ + mset_reg, ocp_intn) \ +[MT6363_ID_##vreg] = { \ + .desc = { \ + .name = match, \ + .supply_name = "vsys-"match, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6363_vreg_setclr_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6363_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = (max - min) / step + 1, \ + .min_uV = min, \ + .uV_step = step, \ + .enable_reg = en_reg, \ + .enable_mask = BIT(MT6363_RG_BUCK_##vreg##_EN_BIT), \ + .vsel_reg = MT6363_RG_BUCK_##vreg##_VOSEL_ADDR, \ + .vsel_mask = MT6363_RG_BUCK_##vreg##_VOSEL_MASK, \ + .of_map_mode = mt6363_map_mode, \ + }, \ + .lp_mode_reg = lp_reg, \ + .lp_mode_mask = BIT(MT6363_RG_BUCK_##vreg##_LP_BIT), \ + .hw_lp_mode_reg = MT6363_BUCK_##vreg##_HW_LP_MODE, \ + .hw_lp_mode_mask = 0xc, \ + .modeset_reg = mset_reg, \ + .modeset_mask = BIT(MT6363_RG_##vreg##_FCCM_BIT), \ + .lp_imax_uA = 100000, \ + .op_en_reg = MT6363_BUCK_##vreg##_OP_EN_0, \ + .hwirq = ocp_intn, \ +} + +#define MT6363_LDO_LINEAR_OPS(match, vreg, in_sup, vops, min, max, \ + step, buck_reg, ocp_intn) \ +[MT6363_ID_##vreg] = { \ + .desc = { \ + .name = match, \ + .supply_name = in_sup, \ + .of_match = of_match_ptr(match), \ + .ops = &vops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6363_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = (max - min) / step + 1, \ + .min_uV = min, \ + .uV_step = step, \ + .enable_reg = MT6363_RG_##buck_reg##_EN_ADDR, \ + .enable_mask = BIT(MT6363_RG_LDO_##vreg##_EN_BIT), \ + .vsel_reg = MT6363_RG_LDO_##vreg##_VOSEL_ADDR, \ + .vsel_mask = MT6363_RG_LDO_##vreg##_VOSEL_MASK, \ + .of_map_mode = mt6363_map_mode, \ + }, \ + .lp_mode_reg = MT6363_RG_##buck_reg##_LP_ADDR, \ + .lp_mode_mask = BIT(MT6363_RG_LDO_##vreg##_LP_BIT), \ + .hw_lp_mode_reg = MT6363_LDO_##vreg##_HW_LP_MODE, \ + .hw_lp_mode_mask = 0x4, \ + .hwirq = ocp_intn, \ +} + +#define MT6363_LDO_L_SC(match, vreg, inp, min, max, step, buck_reg, \ + ocp_intn) \ + MT6363_LDO_LINEAR_OPS(match, vreg, inp, mt6363_vreg_setclr_ops, \ + min, max, step, buck_reg, ocp_intn) + +#define MT6363_LDO_L(match, vreg, inp, min, max, step, buck_reg, \ + ocp_intn) \ + MT6363_LDO_LINEAR_OPS(match, vreg, inp, mt6363_ldo_linear_ops, \ + min, max, step, buck_reg, ocp_intn) + +#define MT6363_LDO_LINEAR_CAL_OPS(match, vreg, in_sup, vops, vrnum, \ + ocp_intn) \ +[MT6363_ID_##vreg] = { \ + .desc = { \ + .name = match, \ + .supply_name = in_sup, \ + .of_match = of_match_ptr(match), \ + .ops = &vops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6363_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = ARRAY_SIZE(ldo_volt_ranges##vrnum) * 11, \ + .linear_ranges = ldo_volt_ranges##vrnum, \ + .n_linear_ranges = ARRAY_SIZE(ldo_volt_ranges##vrnum), \ + .linear_range_selectors_bitfield = ldos_cal_selectors, \ + .enable_reg = MT6363_RG_LDO_##vreg##_ADDR, \ + .enable_mask = BIT(MT6363_RG_LDO_##vreg##_EN_BIT), \ + .vsel_reg = MT6363_RG_##vreg##_VOCAL_ADDR, \ + .vsel_mask = MT6363_RG_##vreg##_VOCAL_MASK, \ + .vsel_range_reg = MT6363_RG_##vreg##_VOSEL_ADDR, \ + .vsel_range_mask = MT6363_RG_##vreg##_VOSEL_MASK, \ + .of_map_mode = mt6363_map_mode, \ + }, \ + .lp_mode_reg = MT6363_RG_LDO_##vreg##_ADDR, \ + .lp_mode_mask = BIT(MT6363_RG_LDO_##vreg##_LP_BIT), \ + .hw_lp_mode_reg = MT6363_LDO_##vreg##_HW_LP_MODE, \ + .hw_lp_mode_mask = 0x4, \ + .lp_imax_uA = 10000, \ + .op_en_reg = MT6363_LDO_##vreg##_OP_EN0, \ + .hwirq = ocp_intn, \ +} + +#define 
MT6363_LDO_VT(match, vreg, inp, vranges_num, ocp_intn) \ + MT6363_LDO_LINEAR_CAL_OPS(match, vreg, inp, mt6363_ldo_vtable_ops,\ + vranges_num, ocp_intn) + +static const unsigned int ldos_cal_selectors[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +static const struct linear_range ldo_volt_ranges0[] = { + REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1500000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2500000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2600000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3100000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3300000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3400000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3500000, 0, 10, 10000) +}; + +static const struct linear_range ldo_volt_ranges1[] = { + REGULATOR_LINEAR_RANGE(900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1100000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1810000, 0, 10, 10000) +}; + +static const struct linear_range ldo_volt_ranges2[] = { + REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2100000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2200000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2300000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2400000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2500000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2600000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3100000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3200000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(3300000, 0, 10, 10000) +}; + +static const struct linear_range ldo_volt_ranges3[] = { + REGULATOR_LINEAR_RANGE(600000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1100000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1400000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1500000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1600000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1700000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(1900000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2000000, 0, 10, 10000), + REGULATOR_LINEAR_RANGE(2100000, 0, 10, 10000) +}; + +static const struct linear_range ldo_volt_ranges4[] = { + REGULATOR_LINEAR_RANGE(550000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(600000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(650000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(700000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(750000, 0, 10, 5000), + 
REGULATOR_LINEAR_RANGE(800000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(900000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(950000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1000000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1050000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1100000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1150000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1700000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1750000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1800000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(1850000, 0, 10, 5000) +}; + +static const struct linear_range ldo_volt_ranges5[] = { + REGULATOR_LINEAR_RANGE(600000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(650000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(700000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(750000, 0, 10, 5000), + REGULATOR_LINEAR_RANGE(800000, 0, 10, 5000) +}; + +static int mt6363_vreg_enable_setclr(struct regulator_dev *rdev) +{ + return regmap_write(rdev->regmap, rdev->desc->enable_reg + EN_SET_OFFSET, + rdev->desc->enable_mask); +} + +static int mt6363_vreg_disable_setclr(struct regulator_dev *rdev) +{ + return regmap_write(rdev->regmap, rdev->desc->enable_reg + EN_CLR_OFFSET, + rdev->desc->enable_mask); +} + +static inline unsigned int mt6363_map_mode(unsigned int mode) +{ + switch (mode) { + case MT6363_REGULATOR_MODE_NORMAL: + return REGULATOR_MODE_NORMAL; + case MT6363_REGULATOR_MODE_FCCM: + return REGULATOR_MODE_FAST; + case MT6363_REGULATOR_MODE_LP: + return REGULATOR_MODE_IDLE; + case MT6363_REGULATOR_MODE_ULP: + return REGULATOR_MODE_STANDBY; + default: + return REGULATOR_MODE_INVALID; + } +} + +static unsigned int mt6363_regulator_get_mode(struct regulator_dev *rdev) +{ + struct mt6363_regulator_info *info = rdev_get_drvdata(rdev); + unsigned int val; + int ret; + + if (info->modeset_reg) { + ret = regmap_read(rdev->regmap, info->modeset_reg, &val); + if (ret) { + dev_err(&rdev->dev, "Failed to get mt6363 mode: %d\n", ret); + return ret; + } + + if (val & info->modeset_mask) + return REGULATOR_MODE_FAST; + } else { + val = 0; + }; + + ret = regmap_read(rdev->regmap, info->hw_lp_mode_reg, &val); + val &= info->hw_lp_mode_mask; + + if (ret) { + dev_err(&rdev->dev, "Failed to get lp mode: %d\n", ret); + return ret; + } + + if (val) + return REGULATOR_MODE_IDLE; + else + return REGULATOR_MODE_NORMAL; +} + +static int mt6363_buck_unlock(struct regmap *map, bool unlock) +{ + u16 buf = unlock ? 
MT6363_BUCK_TOP_UNLOCK_VALUE : 0; + + return regmap_bulk_write(map, MT6363_BUCK_TOP_KEY_PROT_LO, &buf, sizeof(buf)); +} + +static int mt6363_regulator_set_mode(struct regulator_dev *rdev, + unsigned int mode) +{ + struct mt6363_regulator_info *info = rdev_get_drvdata(rdev); + struct regmap *regmap = rdev->regmap; + int cur_mode, ret; + + if (!info->modeset_reg && mode == REGULATOR_MODE_FAST) + return -EOPNOTSUPP; + + switch (mode) { + case REGULATOR_MODE_FAST: + ret = mt6363_buck_unlock(regmap, true); + if (ret) + break; + + ret = regmap_set_bits(regmap, info->modeset_reg, info->modeset_mask); + + mt6363_buck_unlock(regmap, false); + break; + case REGULATOR_MODE_NORMAL: + cur_mode = mt6363_regulator_get_mode(rdev); + if (cur_mode < 0) { + ret = cur_mode; + break; + } + + if (cur_mode == REGULATOR_MODE_FAST) { + ret = mt6363_buck_unlock(regmap, true); + if (ret) + break; + + ret = regmap_clear_bits(regmap, info->modeset_reg, info->modeset_mask); + + mt6363_buck_unlock(regmap, false); + break; + } else if (cur_mode == REGULATOR_MODE_IDLE) { + ret = regmap_clear_bits(regmap, info->lp_mode_reg, info->lp_mode_mask); + if (ret == 0) + usleep_range(100, 200); + } else { + ret = 0; + } + break; + case REGULATOR_MODE_IDLE: + ret = regmap_set_bits(regmap, info->lp_mode_reg, info->lp_mode_mask); + break; + default: + ret = -EINVAL; + } + + if (ret) { + dev_err(&rdev->dev, "Failed to set mode %u: %d\n", mode, ret); + return ret; + } + + return 0; +} + +static int mt6363_regulator_set_load(struct regulator_dev *rdev, int load_uA) +{ + struct mt6363_regulator_info *info = rdev_get_drvdata(rdev); + unsigned int opmode_cfg, opmode_en; + int i, ret; + + if (!info->lp_imax_uA) + return -EINVAL; + + if (load_uA >= info->lp_imax_uA) { + ret = mt6363_regulator_set_mode(rdev, REGULATOR_MODE_NORMAL); + if (ret) + return ret; + + opmode_cfg = NORMAL_OP_CFG; + opmode_en = NORMAL_OP_EN; + } else { + opmode_cfg = info->orig_op_cfg; + opmode_en = info->orig_op_en; + } + + ret = regmap_write(rdev->regmap, info->op_en_reg + OP_CFG_OFFSET, opmode_cfg); + if (ret) + return ret; + + for (i = 0; i < 3; i++) { + ret = regmap_write(rdev->regmap, info->op_en_reg + i, + (opmode_en >> (i * 8)) & GENMASK(7, 0)); + if (ret) + return ret; + } + + return 0; +} + +static int mt6363_vemc_set_voltage_sel(struct regulator_dev *rdev, unsigned int sel) +{ + const u16 tma_unlock_key = MT6363_TMA_UNLOCK_VALUE; + const struct regulator_desc *rdesc = rdev->desc; + struct regmap *regmap = rdev->regmap; + unsigned int range, val; + int i, ret; + u16 mask; + + for (i = 0; i < rdesc->n_linear_ranges; i++) { + const struct linear_range *r = &rdesc->linear_ranges[i]; + unsigned int voltages_in_range = linear_range_values_in_range(r); + + if (sel < voltages_in_range) + break; + sel -= voltages_in_range; + } + + if (i == rdesc->n_linear_ranges) + return -EINVAL; + + ret = regmap_read(rdev->regmap, MT6363_TOP_TRAP, &val); + if (ret) + return ret; + + if (val > 1) + return -EINVAL; + + /* Unlock TMA for writing */ + ret = regmap_bulk_write(rdev->regmap, MT6363_TOP_TMA_KEY_L, + &tma_unlock_key, sizeof(tma_unlock_key)); + if (ret) + return ret; + + /* If HW trapping value is 1, use VEMC_VOSEL_1 instead of VEMC_VOSEL_0 */ + if (val == 1) { + mask = MT6363_RG_VEMC_VOSEL_1_MASK; + sel = FIELD_PREP(MT6363_RG_VEMC_VOSEL_1_MASK, sel); + } else { + mask = rdesc->vsel_mask; + } + + sel <<= ffs(rdesc->vsel_mask) - 1; + sel += rdesc->linear_ranges[i].min_sel; + + range = rdesc->linear_range_selectors_bitfield[i]; + range <<= ffs(rdesc->vsel_range_mask) - 1; + + 
/* Write to the vreg calibration register for voltage finetuning */ + ret = regmap_update_bits(regmap, rdesc->vsel_range_reg, + rdesc->vsel_range_mask, range); + if (ret) + goto lock_tma; + + /* Function must return the result of this write operation */ + ret = regmap_update_bits(regmap, rdesc->vsel_reg, mask, sel); + +lock_tma: + /* Unconditionally re-lock TMA */ + val = 0; + regmap_bulk_write(rdev->regmap, MT6363_TOP_TMA_KEY_L, &val, 2); + + return ret; +} + +static int mt6363_vemc_get_voltage_sel(struct regulator_dev *rdev) +{ + const struct regulator_desc *rdesc = rdev->desc; + unsigned int vosel, trap, calsel; + int vcal, vsel, range, ret; + + ret = regmap_read(rdev->regmap, rdesc->vsel_reg, &vosel); + if (ret) + return ret; + + ret = regmap_read(rdev->regmap, rdesc->vsel_range_reg, &calsel); + if (ret) + return ret; + + calsel &= rdesc->vsel_range_mask; + for (range = 0; range < rdesc->n_linear_ranges; range++) + if (rdesc->linear_range_selectors_bitfield[range] != calsel) + break; + + if (range == rdesc->n_linear_ranges) + return -EINVAL; + + ret = regmap_read(rdev->regmap, MT6363_TOP_TRAP, &trap); + if (ret) + return ret; + + /* If HW trapping value is 1, use VEMC_VOSEL_1 instead of VEMC_VOSEL_0 */ + if (trap > 1) + return -EINVAL; + else if (trap == 1) + vsel = FIELD_GET(MT6363_RG_VEMC_VOSEL_1_MASK, vosel); + else + vsel = vosel & rdesc->vsel_mask; + + vcal = linear_range_values_in_range_array(rdesc->linear_ranges, range); + + return vsel + vcal; +} + +static int mt6363_va15_set_voltage_sel(struct regulator_dev *rdev, unsigned int sel) +{ + struct regmap *regmap = rdev->regmap; + int ret; + + ret = mt6363_buck_unlock(regmap, true); + if (ret) + return ret; + + ret = regulator_set_voltage_sel_pickable_regmap(rdev, sel); + if (ret) + goto va15_unlock; + + ret = regmap_update_bits(regmap, MT6363_RG_BUCK_EFUSE_RSV1, + MT6363_RG_BUCK_EFUSE_RSV1_MASK, sel); + if (ret) + goto va15_unlock; + +va15_unlock: + mt6363_buck_unlock(rdev->regmap, false); + return ret; +} + +static void mt6363_oc_irq_enable_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mt6363_regulator_info *info = + container_of(dwork, struct mt6363_regulator_info, oc_work); + + enable_irq(info->virq); +} + +static irqreturn_t mt6363_oc_isr(int irq, void *data) +{ + struct regulator_dev *rdev = (struct regulator_dev *)data; + struct mt6363_regulator_info *info = rdev_get_drvdata(rdev); + + disable_irq_nosync(info->virq); + + if (regulator_is_enabled_regmap(rdev)) + regulator_notifier_call_chain(rdev, REGULATOR_EVENT_OVER_CURRENT, NULL); + + schedule_delayed_work(&info->oc_work, msecs_to_jiffies(OC_IRQ_ENABLE_DELAY_MS)); + + return IRQ_HANDLED; +} + +static int mt6363_set_ocp(struct regulator_dev *rdev, int lim, int severity, bool enable) +{ + struct mt6363_regulator_info *info = rdev_get_drvdata(rdev); + + /* MT6363 supports only enabling protection and does not support limits */ + if (lim || severity != REGULATOR_SEVERITY_PROT || !enable) + return -EOPNOTSUPP; + + /* If there is no OCP interrupt, there's nothing to set */ + if (info->virq <= 0) + return -EOPNOTSUPP; + + return devm_request_threaded_irq(&rdev->dev, info->virq, NULL, + mt6363_oc_isr, IRQF_ONESHOT, + info->desc.name, rdev); +} + +static const struct regulator_ops mt6363_vreg_setclr_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + 
.set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = mt6363_vreg_enable_setclr, + .disable = mt6363_vreg_disable_setclr, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6363_regulator_set_mode, + .get_mode = mt6363_regulator_get_mode, + .set_load = mt6363_regulator_set_load, + .set_over_current_protection = mt6363_set_ocp, +}; + +static const struct regulator_ops mt6363_ldo_linear_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6363_regulator_set_mode, + .get_mode = mt6363_regulator_get_mode, + .set_over_current_protection = mt6363_set_ocp, +}; + +static const struct regulator_ops mt6363_ldo_vtable_ops = { + .list_voltage = regulator_list_voltage_pickable_linear_range, + .map_voltage = regulator_map_voltage_pickable_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_pickable_regmap, + .get_voltage_sel = regulator_get_voltage_sel_pickable_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6363_regulator_set_mode, + .get_mode = mt6363_regulator_get_mode, + .set_load = mt6363_regulator_set_load, + .set_over_current_protection = mt6363_set_ocp, +}; + +static const struct regulator_ops mt6363_ldo_vemc_ops = { + .list_voltage = regulator_list_voltage_pickable_linear_range, + .map_voltage = regulator_map_voltage_pickable_linear_range, + .set_voltage_sel = mt6363_vemc_set_voltage_sel, + .get_voltage_sel = mt6363_vemc_get_voltage_sel, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6363_regulator_set_mode, + .get_mode = mt6363_regulator_get_mode, + .set_load = mt6363_regulator_set_load, + .set_over_current_protection = mt6363_set_ocp, +}; + +static const struct regulator_ops mt6363_ldo_va15_ops = { + .list_voltage = regulator_list_voltage_pickable_linear_range, + .map_voltage = regulator_map_voltage_pickable_linear_range, + .set_voltage_sel = mt6363_va15_set_voltage_sel, + .get_voltage_sel = regulator_get_voltage_sel_pickable_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6363_regulator_set_mode, + .get_mode = mt6363_regulator_get_mode, + .set_load = mt6363_regulator_set_load, + .set_over_current_protection = mt6363_set_ocp, +}; + +/* The array is indexed by id(MT6363_ID_XXX) */ +static struct mt6363_regulator_info mt6363_regulators[] = { + MT6363_BUCK("vbuck1", VBUCK1, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 1), + MT6363_BUCK("vbuck2", VBUCK2, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 2), + MT6363_BUCK("vbuck3", VBUCK3, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 3), + MT6363_BUCK("vbuck4", VBUCK4, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 
4), + MT6363_BUCK("vbuck5", VBUCK5, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 5), + MT6363_BUCK("vbuck6", VBUCK6, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 6), + MT6363_BUCK("vbuck7", VBUCK7, 0, 1193750, 6250, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_1_FCCM_ADDR, 7), + MT6363_BUCK("vs1", VS1, 0, 2200000, 12500, MT6363_RG_BUCK1_EN_ADDR, + MT6363_RG_BUCK1_LP_ADDR, MT6363_RG_VS1_FCCM_ADDR, 8), + MT6363_BUCK("vs2", VS2, 0, 1600000, 12500, MT6363_RG_BUCK0_EN_ADDR, + MT6363_RG_BUCK0_LP_ADDR, MT6363_RG_BUCK0_FCCM_ADDR, 0), + MT6363_BUCK("vs3", VS3, 0, 1193750, 6250, MT6363_RG_BUCK1_EN_ADDR, + MT6363_RG_BUCK1_LP_ADDR, MT6363_RG_VS3_FCCM_ADDR, 9), + MT6363_LDO_VT("va12-1", VA12_1, "vs2-ldo2", 3, 37), + MT6363_LDO_VT("va12-2", VA12_2, "vs2-ldo2", 3, 38), + MT6363_LDO_LINEAR_CAL_OPS("va15", VA15, "vs1-ldo1", mt6363_ldo_va15_ops, 3, 39), + MT6363_LDO_VT("vaux18", VAUX18, "vsys-ldo1", 2, 31), + MT6363_LDO_VT("vcn13", VCN13, "vs2-ldo2", 1, 17), + MT6363_LDO_VT("vcn15", VCN15, "vs1-ldo2", 3, 16), + MT6363_LDO_LINEAR_CAL_OPS("vemc", VEMC, "vsys-ldo1", mt6363_ldo_vemc_ops, 0, 32), + MT6363_LDO_VT("vio0p75", VIO075, "vs1-ldo1", 5, 36), + MT6363_LDO_VT("vio18", VIO18, "vs1-ldo2", 3, 35), + MT6363_LDO_VT("vm18", VM18, "vs1-ldo1", 4, 40), + MT6363_LDO_L("vsram-apu", VSRAM_APU, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 30), + MT6363_LDO_L("vsram-cpub", VSRAM_CPUB, "vs2-ldo1", 400000, 1193750, 6250, BUCK1, 27), + MT6363_LDO_L("vsram-cpum", VSRAM_CPUM, "vs2-ldo1", 400000, 1193750, 6250, BUCK1, 28), + MT6363_LDO_L("vsram-cpul", VSRAM_CPUL, "vs2-ldo2", 400000, 1193750, 6250, BUCK1, 29), + MT6363_LDO_L_SC("vsram-digrf", VSRAM_DIGRF, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 23), + MT6363_LDO_L_SC("vsram-mdfe", VSRAM_MDFE, "vs3-ldo1", 400000, 1193750, 6250, BUCK1, 24), + MT6363_LDO_L_SC("vsram-modem", VSRAM_MODEM, "vs3-ldo2", 400000, 1193750, 6250, BUCK1, 25), + MT6363_LDO_VT("vrf0p9", VRF09, "vs3-ldo2", 1, 18), + MT6363_LDO_VT("vrf12", VRF12, "vs2-ldo1", 3, 19), + MT6363_LDO_VT("vrf13", VRF13, "vs2-ldo1", 1, 20), + MT6363_LDO_VT("vrf18", VRF18, "vs1-ldo1", 3, 21), + MT6363_LDO_VT("vrf-io18", VRFIO18, "vs1-ldo1", 3, 22), + MT6363_LDO_VT("vtref18", VTREF18, "vsys-ldo1", 2, 26), + MT6363_LDO_VT("vufs12", VUFS12, "vs2-ldo1", 4, 33), + MT6363_LDO_VT("vufs18", VUFS18, "vs1-ldo2", 3, 34), +}; + +static int mt6363_backup_op_setting(struct regmap *map, struct mt6363_regulator_info *info) +{ + unsigned int i, val; + int ret; + + ret = regmap_read(map, info->op_en_reg + OP_CFG_OFFSET, &val); + if (ret) + return ret; + + info->orig_op_cfg = val; + + for (i = 0; i < 3; i++) { + ret = regmap_read(map, info->op_en_reg + i, &val); + if (ret) + return ret; + + info->orig_op_en |= val << (i * 8); + } + + return 0; +} + +static void mt6363_irq_remove(void *data) +{ + int *virq = data; + + irq_dispose_mapping(*virq); +} + +static void mt6363_spmi_remove(void *data) +{ + struct spmi_device *sdev = data; + + spmi_device_remove(sdev); +}; + +static struct regmap *mt6363_spmi_register_regmap(struct device *dev) +{ + struct regmap_config mt6363_regmap_config = { + .reg_bits = 16, + .val_bits = 16, + .max_register = 0x1f90, + .fast_io = true, + }; + struct spmi_device *sdev, *sparent; + u32 base; + int ret; + + if (!dev->parent) + return ERR_PTR(-ENODEV); + + ret = device_property_read_u32(dev, "reg", &base); + if (ret) + return ERR_PTR(ret); + + sparent = to_spmi_device(dev->parent); + if (!sparent) + 
return ERR_PTR(-ENODEV); + + sdev = spmi_device_alloc(sparent->ctrl); + if (!sdev) + return ERR_PTR(-ENODEV); + + sdev->usid = sparent->usid; + dev_set_name(&sdev->dev, "%d-%02x-regulator", sdev->ctrl->nr, sdev->usid); + ret = device_add(&sdev->dev); + if (ret) { + put_device(&sdev->dev); + return ERR_PTR(ret); + }; + + ret = devm_add_action_or_reset(dev, mt6363_spmi_remove, sdev); + if (ret) + return ERR_PTR(ret); + + mt6363_regmap_config.reg_base = base; + + return devm_regmap_init_spmi_ext(sdev, &mt6363_regmap_config); +} + +static int mt6363_regulator_probe(struct platform_device *pdev) +{ + struct device_node *interrupt_parent; + struct regulator_config config = {}; + struct mt6363_regulator_info *info; + struct device *dev = &pdev->dev; + struct regulator_dev *rdev; + struct irq_domain *domain; + struct irq_fwspec fwspec; + struct spmi_device *sdev; + int i, ret; + + config.regmap = mt6363_spmi_register_regmap(dev); + if (IS_ERR(config.regmap)) + return dev_err_probe(dev, PTR_ERR(config.regmap), + "Cannot get regmap\n"); + config.dev = dev; + sdev = to_spmi_device(dev->parent); + + interrupt_parent = of_irq_find_parent(dev->of_node); + if (!interrupt_parent) + return dev_err_probe(dev, -EINVAL, "Cannot find IRQ parent\n"); + + domain = irq_find_host(interrupt_parent); + of_node_put(interrupt_parent); + fwspec.fwnode = domain->fwnode; + + fwspec.param_count = 3; + fwspec.param[0] = sdev->usid; + fwspec.param[2] = IRQ_TYPE_LEVEL_HIGH; + + for (i = 0; i < ARRAY_SIZE(mt6363_regulators); i++) { + info = &mt6363_regulators[i]; + + fwspec.param[1] = info->hwirq; + info->virq = irq_create_fwspec_mapping(&fwspec); + if (!info->virq) + return dev_err_probe(dev, -EINVAL, + "Failed to map IRQ%d\n", info->hwirq); + + ret = devm_add_action_or_reset(dev, mt6363_irq_remove, &info->virq); + if (ret) { + irq_dispose_mapping(info->hwirq); + return ret; + } + + config.driver_data = info; + INIT_DELAYED_WORK(&info->oc_work, mt6363_oc_irq_enable_work); + + rdev = devm_regulator_register(dev, &info->desc, &config); + if (IS_ERR(rdev)) + return dev_err_probe(dev, PTR_ERR(rdev), + "failed to register %s\n", info->desc.name); + + if (info->lp_imax_uA) { + ret = mt6363_backup_op_setting(config.regmap, info); + if (ret) { + dev_warn(dev, "Failed to backup op_setting for %s\n", + info->desc.name); + info->lp_imax_uA = 0; + } + } + } + + return 0; +} + +static const struct of_device_id mt6363_regulator_match[] = { + { .compatible = "mediatek,mt6363-regulator" }, + { /* sentinel */ } +}; + +static struct platform_driver mt6363_regulator_driver = { + .driver = { + .name = "mt6363-regulator", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = mt6363_regulator_match, + }, + .probe = mt6363_regulator_probe, +}; +module_platform_driver(mt6363_regulator_driver); + +MODULE_AUTHOR("AngeloGioacchino Del Regno "); +MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6363 PMIC"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/mt6363-regulator.h b/include/linux/regulator/mt6363-regulator.h new file mode 100644 index 000000000000..60761f01d3ad --- /dev/null +++ b/include/linux/regulator/mt6363-regulator.h @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 MediaTek Inc. 
+ * Copyright (c) 2025 Collabora Ltd + */ + +#include + +#ifndef __LINUX_REGULATOR_MT6363_H +#define __LINUX_REGULATOR_MT6363_H + +/* Register */ +#define MT6363_TOP_TRAP 0x6 +#define MT6363_TOP_TMA_KEY_L 0x36e +#define MT6363_RG_BUCK0_EN_ADDR 0x210 +#define MT6363_RG_BUCK_VS2_EN_BIT 0 +#define MT6363_RG_BUCK_VBUCK1_EN_BIT 1 +#define MT6363_RG_BUCK_VBUCK2_EN_BIT 2 +#define MT6363_RG_BUCK_VBUCK3_EN_BIT 3 +#define MT6363_RG_BUCK_VBUCK4_EN_BIT 4 +#define MT6363_RG_BUCK_VBUCK5_EN_BIT 5 +#define MT6363_RG_BUCK_VBUCK6_EN_BIT 6 +#define MT6363_RG_BUCK_VBUCK7_EN_BIT 7 +#define MT6363_RG_BUCK1_EN_ADDR 0x213 +#define MT6363_RG_BUCK_VS1_EN_BIT 0 +#define MT6363_RG_BUCK_VS3_EN_BIT 1 +#define MT6363_RG_LDO_VSRAM_DIGRF_EN_BIT 4 +#define MT6363_RG_LDO_VSRAM_MDFE_EN_BIT 5 +#define MT6363_RG_LDO_VSRAM_MODEM_EN_BIT 6 +#define MT6363_RG_BUCK0_LP_ADDR 0x216 +#define MT6363_RG_BUCK_VS2_LP_BIT 0 +#define MT6363_RG_BUCK_VBUCK1_LP_BIT 1 +#define MT6363_RG_BUCK_VBUCK2_LP_BIT 2 +#define MT6363_RG_BUCK_VBUCK3_LP_BIT 3 +#define MT6363_RG_BUCK_VBUCK4_LP_BIT 4 +#define MT6363_RG_BUCK_VBUCK5_LP_BIT 5 +#define MT6363_RG_BUCK_VBUCK6_LP_BIT 6 +#define MT6363_RG_BUCK_VBUCK7_LP_BIT 7 +#define MT6363_RG_BUCK1_LP_ADDR 0x219 +#define MT6363_RG_BUCK_VS1_LP_BIT 0 +#define MT6363_RG_BUCK_VS3_LP_BIT 1 +#define MT6363_RG_LDO_VSRAM_DIGRF_LP_BIT 4 +#define MT6363_RG_LDO_VSRAM_MDFE_LP_BIT 5 +#define MT6363_RG_LDO_VSRAM_MODEM_LP_BIT 6 +#define MT6363_RG_BUCK_VS2_VOSEL_ADDR 0x21c +#define MT6363_RG_BUCK_VS2_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK1_VOSEL_ADDR 0x21d +#define MT6363_RG_BUCK_VBUCK1_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK2_VOSEL_ADDR 0x21e +#define MT6363_RG_BUCK_VBUCK2_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK3_VOSEL_ADDR 0x21f +#define MT6363_RG_BUCK_VBUCK3_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK4_VOSEL_ADDR 0x220 +#define MT6363_RG_BUCK_VBUCK4_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK5_VOSEL_ADDR 0x221 +#define MT6363_RG_BUCK_VBUCK5_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK6_VOSEL_ADDR 0x222 +#define MT6363_RG_BUCK_VBUCK6_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK7_VOSEL_ADDR 0x223 +#define MT6363_RG_BUCK_VBUCK7_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VS1_VOSEL_ADDR 0x224 +#define MT6363_RG_BUCK_VS1_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VS3_VOSEL_ADDR 0x225 +#define MT6363_RG_BUCK_VS3_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_ADDR 0x228 +#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_ADDR 0x229 +#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_ADDR 0x22a +#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_MASK GENMASK(6, 0) +#define MT6363_BUCK_TOP_KEY_PROT_LO 0x13fa +#define MT6363_BUCK_VS2_WDTDBG_VOSEL_ADDR 0x13fc +#define MT6363_BUCK_VBUCK1_WDTDBG_VOSEL_ADDR 0x13fd +#define MT6363_BUCK_VBUCK2_WDTDBG_VOSEL_ADDR 0x13fe +#define MT6363_BUCK_VBUCK3_WDTDBG_VOSEL_ADDR 0x13ff +#define MT6363_BUCK_VBUCK4_WDTDBG_VOSEL_ADDR 0x1400 +#define MT6363_BUCK_VBUCK5_WDTDBG_VOSEL_ADDR 0x1401 +#define MT6363_BUCK_VBUCK6_WDTDBG_VOSEL_ADDR 0x1402 +#define MT6363_BUCK_VBUCK7_WDTDBG_VOSEL_ADDR 0x1403 +#define MT6363_BUCK_VS1_WDTDBG_VOSEL_ADDR 0x1404 +#define MT6363_BUCK_VS3_WDTDBG_VOSEL_ADDR 0x1405 +#define MT6363_RG_BUCK_EFUSE_RSV1 0x1417 +#define MT6363_RG_BUCK_EFUSE_RSV1_MASK GENMASK(7, 4) +#define MT6363_BUCK_VS2_OP_EN_0 0x145d +#define MT6363_BUCK_VS2_HW_LP_MODE 0x1468 +#define MT6363_BUCK_VBUCK1_OP_EN_0 0x14dd 
+#define MT6363_BUCK_VBUCK1_HW_LP_MODE 0x14e8 +#define MT6363_RG_BUCK_VBUCK1_SSHUB_EN_ADDR 0x14ea +#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_ADDR 0x14eb +#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK2_OP_EN_0 0x155d +#define MT6363_BUCK_VBUCK2_HW_LP_MODE 0x1568 +#define MT6363_RG_BUCK_VBUCK2_SSHUB_EN_ADDR 0x156a +#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_ADDR 0x156b +#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK3_OP_EN_0 0x15dd +#define MT6363_BUCK_VBUCK3_HW_LP_MODE 0x15e8 +#define MT6363_BUCK_VBUCK4_OP_EN_0 0x165d +#define MT6363_BUCK_VBUCK4_HW_LP_MODE 0x1668 +#define MT6363_RG_BUCK_VBUCK4_SSHUB_EN_ADDR 0x166a +#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_ADDR 0x166b +#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK5_OP_EN_0 0x16dd +#define MT6363_BUCK_VBUCK5_HW_LP_MODE 0x16e8 +#define MT6363_BUCK_VBUCK6_OP_EN_0 0x175d +#define MT6363_BUCK_VBUCK6_HW_LP_MODE 0x1768 +#define MT6363_BUCK_VBUCK7_OP_EN_0 0x17dd +#define MT6363_BUCK_VBUCK7_HW_LP_MODE 0x17e8 +#define MT6363_BUCK_VS1_OP_EN_0 0x185d +#define MT6363_BUCK_VS1_HW_LP_MODE 0x1868 +#define MT6363_BUCK_VS3_OP_EN_0 0x18dd +#define MT6363_BUCK_VS3_HW_LP_MODE 0x18e8 +#define MT6363_RG_VS1_FCCM_ADDR 0x1964 +#define MT6363_RG_VS1_FCCM_BIT 0 +#define MT6363_RG_VS3_FCCM_ADDR 0x1973 +#define MT6363_RG_VS3_FCCM_BIT 0 +#define MT6363_RG_BUCK0_FCCM_ADDR 0x1a02 +#define MT6363_RG_VBUCK1_FCCM_BIT 0 +#define MT6363_RG_VBUCK2_FCCM_BIT 1 +#define MT6363_RG_VBUCK3_FCCM_BIT 2 +#define MT6363_RG_VS2_FCCM_BIT 3 +#define MT6363_RG_BUCK0_1_FCCM_ADDR 0x1a82 +#define MT6363_RG_VBUCK4_FCCM_BIT 0 +#define MT6363_RG_VBUCK5_FCCM_BIT 1 +#define MT6363_RG_VBUCK6_FCCM_BIT 2 +#define MT6363_RG_VBUCK7_FCCM_BIT 3 +#define MT6363_RG_VCN13_VOSEL_ADDR 0x1b0f +#define MT6363_RG_VCN13_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VEMC_VOSEL_ADDR 0x1b10 +#define MT6363_RG_VEMC_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VEMC_VOSEL_1_MASK GENMASK(7, 4) +#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_ADDR 0x1b14 +#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_ADDR 0x1b15 +#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_ADDR 0x1b16 +#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_APU_VOSEL_ADDR 0x1b17 +#define MT6363_RG_LDO_VSRAM_APU_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_VEMC_VOCAL_ADDR 0x1b1b +#define MT6363_RG_VEMC_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_LDO_VCN15_ADDR 0x1b57 +#define MT6363_RG_LDO_VCN15_EN_BIT 0 +#define MT6363_RG_LDO_VCN15_LP_BIT 1 +#define MT6363_LDO_VCN15_HW_LP_MODE 0x1b5b +#define MT6363_LDO_VCN15_OP_EN0 0x1b5c +#define MT6363_RG_LDO_VRF09_ADDR 0x1b65 +#define MT6363_RG_LDO_VRF09_EN_BIT 0 +#define MT6363_RG_LDO_VRF09_LP_BIT 1 +#define MT6363_LDO_VRF09_HW_LP_MODE 0x1b69 +#define MT6363_LDO_VRF09_OP_EN0 0x1b6a +#define MT6363_RG_LDO_VRF12_ADDR 0x1b73 +#define MT6363_RG_LDO_VRF12_EN_BIT 0 +#define MT6363_RG_LDO_VRF12_LP_BIT 1 +#define MT6363_LDO_VRF12_HW_LP_MODE 0x1b77 +#define MT6363_LDO_VRF12_OP_EN0 0x1b78 +#define MT6363_RG_LDO_VRF13_ADDR 0x1b81 +#define MT6363_RG_LDO_VRF13_EN_BIT 0 +#define MT6363_RG_LDO_VRF13_LP_BIT 1 +#define MT6363_LDO_VRF13_HW_LP_MODE 0x1b85 +#define MT6363_LDO_VRF13_OP_EN0 0x1b86 +#define MT6363_RG_LDO_VRF18_ADDR 0x1b8f +#define MT6363_RG_LDO_VRF18_EN_BIT 0 +#define MT6363_RG_LDO_VRF18_LP_BIT 1 +#define MT6363_LDO_VRF18_HW_LP_MODE 0x1b93 +#define MT6363_LDO_VRF18_OP_EN0 0x1b94 
+#define MT6363_RG_LDO_VRFIO18_ADDR 0x1b9d +#define MT6363_RG_LDO_VRFIO18_EN_BIT 0 +#define MT6363_RG_LDO_VRFIO18_LP_BIT 1 +#define MT6363_LDO_VRFIO18_HW_LP_MODE 0x1ba1 +#define MT6363_LDO_VRFIO18_OP_EN0 0x1ba2 +#define MT6363_RG_LDO_VTREF18_ADDR 0x1bd7 +#define MT6363_RG_LDO_VTREF18_EN_BIT 0 +#define MT6363_RG_LDO_VTREF18_LP_BIT 1 +#define MT6363_LDO_VTREF18_HW_LP_MODE 0x1bdb +#define MT6363_LDO_VTREF18_OP_EN0 0x1bdc +#define MT6363_RG_LDO_VAUX18_ADDR 0x1be5 +#define MT6363_RG_LDO_VAUX18_EN_BIT 0 +#define MT6363_RG_LDO_VAUX18_LP_BIT 1 +#define MT6363_LDO_VAUX18_HW_LP_MODE 0x1be9 +#define MT6363_LDO_VAUX18_OP_EN0 0x1bea +#define MT6363_RG_LDO_VEMC_ADDR 0x1bf3 +#define MT6363_RG_LDO_VEMC_EN_BIT 0 +#define MT6363_RG_LDO_VEMC_LP_BIT 1 +#define MT6363_LDO_VEMC_HW_LP_MODE 0x1bf7 +#define MT6363_LDO_VEMC_OP_EN0 0x1bf8 +#define MT6363_RG_LDO_VUFS12_ADDR 0x1c01 +#define MT6363_RG_LDO_VUFS12_EN_BIT 0 +#define MT6363_RG_LDO_VUFS12_LP_BIT 1 +#define MT6363_LDO_VUFS12_HW_LP_MODE 0x1c05 +#define MT6363_LDO_VUFS12_OP_EN0 0x1c06 +#define MT6363_RG_LDO_VUFS18_ADDR 0x1c0f +#define MT6363_RG_LDO_VUFS18_EN_BIT 0 +#define MT6363_RG_LDO_VUFS18_LP_BIT 1 +#define MT6363_LDO_VUFS18_HW_LP_MODE 0x1c13 +#define MT6363_LDO_VUFS18_OP_EN0 0x1c14 +#define MT6363_RG_LDO_VIO18_ADDR 0x1c1d +#define MT6363_RG_LDO_VIO18_EN_BIT 0 +#define MT6363_RG_LDO_VIO18_LP_BIT 1 +#define MT6363_LDO_VIO18_HW_LP_MODE 0x1c21 +#define MT6363_LDO_VIO18_OP_EN0 0x1c22 +#define MT6363_RG_LDO_VIO075_ADDR 0x1c57 +#define MT6363_RG_LDO_VIO075_EN_BIT 0 +#define MT6363_RG_LDO_VIO075_LP_BIT 1 +#define MT6363_LDO_VIO075_HW_LP_MODE 0x1c5b +#define MT6363_LDO_VIO075_OP_EN0 0x1c5c +#define MT6363_RG_LDO_VA12_1_ADDR 0x1c65 +#define MT6363_RG_LDO_VA12_1_EN_BIT 0 +#define MT6363_RG_LDO_VA12_1_LP_BIT 1 +#define MT6363_LDO_VA12_1_HW_LP_MODE 0x1c69 +#define MT6363_LDO_VA12_1_OP_EN0 0x1c6a +#define MT6363_RG_LDO_VA12_2_ADDR 0x1c73 +#define MT6363_RG_LDO_VA12_2_EN_BIT 0 +#define MT6363_RG_LDO_VA12_2_LP_BIT 1 +#define MT6363_LDO_VA12_2_HW_LP_MODE 0x1c77 +#define MT6363_LDO_VA12_2_OP_EN0 0x1c78 +#define MT6363_RG_LDO_VA15_ADDR 0x1c81 +#define MT6363_RG_LDO_VA15_EN_BIT 0 +#define MT6363_RG_LDO_VA15_LP_BIT 1 +#define MT6363_LDO_VA15_HW_LP_MODE 0x1c85 +#define MT6363_LDO_VA15_OP_EN0 0x1c86 +#define MT6363_RG_LDO_VM18_ADDR 0x1c8f +#define MT6363_RG_LDO_VM18_EN_BIT 0 +#define MT6363_RG_LDO_VM18_LP_BIT 1 +#define MT6363_LDO_VM18_HW_LP_MODE 0x1c93 +#define MT6363_LDO_VM18_OP_EN0 0x1c94 +#define MT6363_RG_LDO_VCN13_ADDR 0x1cd7 +#define MT6363_RG_LDO_VCN13_EN_BIT 0 +#define MT6363_RG_LDO_VCN13_LP_BIT 1 +#define MT6363_LDO_VCN13_HW_LP_MODE 0x1cdb +#define MT6363_LDO_VCN13_OP_EN0 0x1ce4 +#define MT6363_LDO_VSRAM_DIGRF_HW_LP_MODE 0x1cf1 +#define MT6363_LDO_VSRAM_DIGRF_OP_EN0 0x1cfa +#define MT6363_LDO_VSRAM_MDFE_HW_LP_MODE 0x1d5b +#define MT6363_LDO_VSRAM_MDFE_OP_EN0 0x1d64 +#define MT6363_LDO_VSRAM_MODEM_HW_LP_MODE 0x1d76 +#define MT6363_LDO_VSRAM_MODEM_OP_EN0 0x1d7f +#define MT6363_RG_LDO_VSRAM_CPUB_ADDR 0x1dd7 +#define MT6363_RG_LDO_VSRAM_CPUB_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUB_LP_BIT 1 +#define MT6363_LDO_VSRAM_CPUB_HW_LP_MODE 0x1ddb +#define MT6363_LDO_VSRAM_CPUB_OP_EN0 0x1de4 +#define MT6363_RG_LDO_VSRAM_CPUM_ADDR 0x1ded +#define MT6363_RG_LDO_VSRAM_CPUM_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUM_LP_BIT 1 +#define MT6363_LDO_VSRAM_CPUM_HW_LP_MODE 0x1df1 +#define MT6363_LDO_VSRAM_CPUM_OP_EN0 0x1dfa +#define MT6363_RG_LDO_VSRAM_CPUL_ADDR 0x1e57 +#define MT6363_RG_LDO_VSRAM_CPUL_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUL_LP_BIT 1 +#define 
MT6363_LDO_VSRAM_CPUL_HW_LP_MODE 0x1e5b +#define MT6363_LDO_VSRAM_CPUL_OP_EN0 0x1e64 +#define MT6363_RG_LDO_VSRAM_APU_ADDR 0x1e6d +#define MT6363_RG_LDO_VSRAM_APU_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_APU_LP_BIT 1 +#define MT6363_LDO_VSRAM_APU_HW_LP_MODE 0x1e71 +#define MT6363_LDO_VSRAM_APU_OP_EN0 0x1e7a +#define MT6363_RG_VTREF18_VOCAL_ADDR 0x1ed8 +#define MT6363_RG_VTREF18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VTREF18_VOSEL_ADDR 0x1ed9 +#define MT6363_RG_VTREF18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VAUX18_VOCAL_ADDR 0x1edc +#define MT6363_RG_VAUX18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VAUX18_VOSEL_ADDR 0x1edd +#define MT6363_RG_VAUX18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VCN15_VOCAL_ADDR 0x1ee3 +#define MT6363_RG_VCN15_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VCN15_VOSEL_ADDR 0x1ee4 +#define MT6363_RG_VCN15_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS18_VOCAL_ADDR 0x1ee7 +#define MT6363_RG_VUFS18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS18_VOSEL_ADDR 0x1ee8 +#define MT6363_RG_VUFS18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO18_VOCAL_ADDR 0x1eeb +#define MT6363_RG_VIO18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO18_VOSEL_ADDR 0x1eec +#define MT6363_RG_VIO18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VM18_VOCAL_ADDR 0x1eef +#define MT6363_RG_VM18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VM18_VOSEL_ADDR 0x1ef0 +#define MT6363_RG_VM18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA15_VOCAL_ADDR 0x1ef3 +#define MT6363_RG_VA15_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA15_VOSEL_ADDR 0x1ef4 +#define MT6363_RG_VA15_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF18_VOCAL_ADDR 0x1ef7 +#define MT6363_RG_VRF18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF18_VOSEL_ADDR 0x1ef8 +#define MT6363_RG_VRF18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRFIO18_VOCAL_ADDR 0x1efb +#define MT6363_RG_VRFIO18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRFIO18_VOSEL_ADDR 0x1efc +#define MT6363_RG_VRFIO18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO075_VOCFG_ADDR 0x1f01 +#define MT6363_RG_VIO075_VOCAL_ADDR MT6363_RG_VIO075_VOCFG_ADDR +#define MT6363_RG_VIO075_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO075_VOSEL_ADDR MT6363_RG_VIO075_VOCFG_ADDR +#define MT6363_RG_VIO075_VOSEL_MASK GENMASK(6, 4) +#define MT6363_RG_VCN13_VOCAL_ADDR 0x1f58 +#define MT6363_RG_VCN13_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS12_VOCAL_ADDR 0x1f61 +#define MT6363_RG_VUFS12_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS12_VOSEL_ADDR 0x1f62 +#define MT6363_RG_VUFS12_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_1_VOCAL_ADDR 0x1f65 +#define MT6363_RG_VA12_1_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_1_VOSEL_ADDR 0x1f66 +#define MT6363_RG_VA12_1_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_2_VOCAL_ADDR 0x1f69 +#define MT6363_RG_VA12_2_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_2_VOSEL_ADDR 0x1f6a +#define MT6363_RG_VA12_2_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF12_VOCAL_ADDR 0x1f6d +#define MT6363_RG_VRF12_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF12_VOSEL_ADDR 0x1f6e +#define MT6363_RG_VRF12_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF13_VOCAL_ADDR 0x1f71 +#define MT6363_RG_VRF13_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF13_VOSEL_ADDR 0x1f72 +#define MT6363_RG_VRF13_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF09_VOCAL_ADDR 0x1f78 +#define MT6363_RG_VRF09_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF09_VOSEL_ADDR 0x1f79 +#define MT6363_RG_VRF09_VOSEL_MASK GENMASK(3, 0) +#define MT6363_ISINK_EN_CTRL0 0x21db +#define 
MT6363_ISINK_CTRL0_MASK GENMASK(7, 0) +#define MT6363_ISINK_EN_CTRL1 0x21dc +#define MT6363_ISINK_CTRL1_MASK GENMASK(7, 4) + +#endif /* __LINUX_REGULATOR_MT6363_H */ -- cgit v1.2.3 From fdb9aed869f34d776298b3a8197909eb820e4d0d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:38 +0900 Subject: block: introduce disk_report_zone() Commit b76b840fd933 ("dm: Fix dm-zoned-reclaim zone write pointer alignment") introduced an indirect call for the callback function of a report zones executed with blkdev_report_zones(). This is necessary so that the function disk_zone_wplug_sync_wp_offset() can be called to refresh a zone write plug zone write pointer offset after a write error. However, this solution makes following the path of zone information harder to understand. Clean this up by introducing the new blk_report_zones_args structure to define a zone report callback and its private data, and introduce the helper function disk_report_zone() which calls both disk_zone_wplug_sync_wp_offset() and the zone report user callback function for all zones of a zone report. This helper function must be called by all block device drivers that implement the report zones block operation in order to correctly report zone information. All block device drivers supporting the report_zones block operation are updated to use this new scheme. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 79 +++++++++++++++++++++------------------ drivers/block/null_blk/null_blk.h | 3 +- drivers/block/null_blk/zoned.c | 4 +- drivers/block/ublk_drv.c | 4 +- drivers/block/virtio_blk.c | 11 +++--- drivers/block/zloop.c | 4 +- drivers/md/dm-zone.c | 54 ++++++++++++++------------ drivers/md/dm.h | 3 +- drivers/nvme/host/core.c | 5 +-- drivers/nvme/host/multipath.c | 4 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/zns.c | 10 ++--- drivers/scsi/sd.h | 2 +- drivers/scsi/sd_zbc.c | 20 ++++------ include/linux/blkdev.h | 7 +++- include/linux/device-mapper.h | 10 ++++- 16 files changed, 120 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 345a99c0b031..de3524c17f67 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -114,30 +114,16 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -struct disk_report_zones_cb_args { - struct gendisk *disk; - report_zones_cb user_cb; - void *user_data; +/* + * Zone report arguments for block device drivers report_zones operation. + * @cb: report_zones_cb callback for each reported zone. + * @data: Private data passed to report_zones_cb.
+ */ +struct blk_report_zones_args { + report_zones_cb cb; + void *data; }; -static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, - struct blk_zone *zone); - -static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - struct disk_report_zones_cb_args *args = data; - struct gendisk *disk = args->disk; - - if (disk->zone_wplugs_hash) - disk_zone_wplug_sync_wp_offset(disk, zone); - - if (!args->user_cb) - return 0; - - return args->user_cb(zone, idx, args->user_data); -} - /** * blkdev_report_zones - Get zones information * @bdev: Target block device @@ -161,10 +147,9 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct gendisk *disk = bdev->bd_disk; - struct disk_report_zones_cb_args args = { - .disk = disk, - .user_cb = cb, - .user_data = data, + struct blk_report_zones_args args = { + .cb = cb, + .data = data, }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) @@ -173,8 +158,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= get_capacity(disk)) return 0; - return disk->fops->report_zones(disk, sector, nr_zones, - disk_report_zones_cb, &args); + return disk->fops->report_zones(disk, sector, nr_zones, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); @@ -692,15 +676,32 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, disk_put_zone_wplug(zwplug); } -static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +/** + * disk_report_zone - Report one zone + * @disk: Target disk + * @zone: The zone to report + * @idx: The index of the zone in the overall zone report + * @args: report zones callback and data + * + * Description: + * Helper function for block device drivers to report one zone of a zone + * report initiated with blkdev_report_zones(). The zone being reported is + * specified by @zone and used to update, if necessary, the zone write plug + * information for the zone. If @args specifies a user callback function, + * this callback is executed. 
+ */ +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args) { - struct disk_report_zones_cb_args args = { - .disk = disk, - }; + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); + + if (args && args->cb) + return args->cb(zone, idx, args->data); - return disk->fops->report_zones(disk, sector, 1, - disk_report_zones_cb, &args); + return 0; } +EXPORT_SYMBOL_GPL(disk_report_zone); static void blk_zone_reset_bio_endio(struct bio *bio) { @@ -1786,6 +1787,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk) sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int memflags, noio_flag; + struct blk_report_zones_args rep_args = { + .cb = blk_revalidate_zone_cb, + .data = &args, + }; int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) @@ -1817,8 +1822,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return ret; } - ret = disk->fops->report_zones(disk, 0, UINT_MAX, - blk_revalidate_zone_cb, &args); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); if (!ret) { pr_warn("%s: No zones reported\n", disk->disk_name); ret = -ENODEV; @@ -1863,6 +1867,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { + struct gendisk *disk = bdev->bd_disk; int ret; if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) @@ -1878,7 +1883,7 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, * pointer. Undo this using a report zone to update the zone write * pointer to the correct current value. */ - ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + ret = disk->fops->report_zones(disk, sector, 1, NULL); if (ret != 1) return ret < 0 ? 
ret : -EIO; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 7bb6128dbaaf..6c4c4bbe7dad 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 4e5728f45989..6a93b12a06ff 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev) } int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; @@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector, blkz.capacity = zone->capacity; null_unlock_zone(dev, zone); - error = cb(&blkz, i, data); + error = disk_report_zone(disk, &blkz, i, args); if (error) return error; } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 96e07763cd28..97cc4bc0a6ce 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -367,7 +367,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk, } static int ublk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct ublk_device *ub = disk->private_data; unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; @@ -430,7 +430,7 @@ free_req: if (!zone->len) break; - ret = cb(zone, i, data); + ret = disk_report_zone(disk, zone, i, args); if (ret) goto out; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f061420dfb10..a5e97f03dbf0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -584,7 +584,8 @@ out: static int virtblk_parse_zone(struct virtio_blk *vblk, struct virtio_blk_zone_descriptor *entry, - unsigned int idx, report_zones_cb cb, void *data) + unsigned int idx, + struct blk_report_zones_args *args) { struct blk_zone zone = { }; @@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk, * The callback below checks the validity of the reported * entry data, no need to further validate it here. 
*/ - return cb(&zone, idx, data); + return disk_report_zone(vblk->disk, &zone, idx, args); } static int virtblk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, - void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct virtio_blk *vblk = disk->private_data; struct virtio_blk_zone_report *report; @@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = virtblk_parse_zone(vblk, &report->zones[i], - zone_idx, cb, data); + zone_idx, args); if (ret) goto fail_report; diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index a423228e201b..92be9f0af00a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -647,7 +647,7 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode) } static int zloop_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct zloop_device *zlo = disk->private_data; struct blk_zone blkz = {}; @@ -687,7 +687,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector, mutex_unlock(&zone->lock); - ret = cb(&blkz, i, data); + ret = disk_report_zone(disk, &blkz, i, args); if (ret) return ret; } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 78e17dd4d01b..984fb621b0e9 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -17,33 +17,26 @@ * For internal zone reports bypassing the top BIO submission path. */ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, - sector_t sector, unsigned int nr_zones, - report_zones_cb cb, void *data) + unsigned int nr_zones, + struct dm_report_zones_args *args) { - struct gendisk *disk = md->disk; - int ret; - struct dm_report_zones_args args = { - .next_sector = sector, - .orig_data = data, - .orig_cb = cb, - }; - do { struct dm_target *tgt; + int ret; - tgt = dm_table_find_target(t, args.next_sector); + tgt = dm_table_find_target(t, args->next_sector); if (WARN_ON_ONCE(!tgt->type->report_zones)) return -EIO; - args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, - nr_zones - args.zone_idx); + args->tgt = tgt; + ret = tgt->type->report_zones(tgt, args, + nr_zones - args->zone_idx); if (ret < 0) return ret; - } while (args.zone_idx < nr_zones && - args.next_sector < get_capacity(disk)); + } while (args->zone_idx < nr_zones && + args->next_sector < get_capacity(md->disk)); - return args.zone_idx; + return args->zone_idx; } /* @@ -52,7 +45,8 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, * generally implemented by targets using dm_report_zones(). 
*/ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct mapped_device *md = disk->private_data; struct dm_table *map; @@ -76,9 +70,14 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, map = zone_revalidate_map; } - if (map) - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, - data); + if (map) { + struct dm_report_zones_args dm_args = { + .disk = md->disk, + .next_sector = sector, + .rep_args = args, + }; + ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args); + } if (put_table) dm_put_live_table(md, srcu_idx); @@ -113,7 +112,9 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, } args->next_sector = zone->start + zone->len; - return args->orig_cb(zone, args->zone_idx++, args->orig_data); + + return disk_report_zone(args->disk, zone, args->zone_idx++, + args->rep_args); } /* @@ -492,10 +493,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset) { + struct dm_report_zones_args args = { + .disk = md->disk, + .next_sector = sector, + .cb = dm_zone_need_reset_cb, + .data = need_reset, + }; int ret; - ret = dm_blk_do_report_zones(md, t, sector, nr_zones, - dm_zone_need_reset_cb, need_reset); + ret = dm_blk_do_report_zones(md, t, nr_zones, &args); if (ret != nr_zones) { DMERR("Get %s zone reset bitmap failed\n", md->disk->disk_name); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 245f52b59215..7a795979ec72 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -109,7 +109,8 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fa4181d7de73..c0fe50fb7b08 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2599,10 +2599,9 @@ static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) #ifdef CONFIG_BLK_DEV_ZONED static int nvme_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { - return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb, - data); + return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args); } #else #define nvme_report_zones NULL diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 543e17aead12..0b7ac0735bd0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -576,7 +576,7 @@ static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16], #ifdef CONFIG_BLK_DEV_ZONED static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nvme_ns_head *head = disk->private_data; struct nvme_ns *ns; @@ -585,7 +585,7 @@ static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t 
sector, srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (ns) - ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + ret = nvme_ns_report_zones(ns, sector, nr_zones, args); srcu_read_unlock(&head->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 102fae6a231c..928c748ccbd1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1108,7 +1108,7 @@ struct nvme_zone_info { }; int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, struct blk_report_zones_args *args); int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, struct nvme_zone_info *zi); void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index cce4c5b55aa9..deea2dbef5b8 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -148,8 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, - unsigned int idx, report_zones_cb cb, - void *data) + unsigned int idx, + struct blk_report_zones_args *args) { struct nvme_ns_head *head = ns->head; struct blk_zone zone = { }; @@ -169,11 +169,11 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns, else zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp)); - return cb(&zone, idx, data); + return disk_report_zone(ns->disk, &zone, idx, args); } int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nvme_zone_report *report; struct nvme_command c = { }; @@ -213,7 +213,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = nvme_zone_parse_entry(ns, &report->entries[i], - zone_idx, cb, data); + zone_idx, args); if (ret) goto out_free; zone_idx++; diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 36382eca941c..574af8243016 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -240,7 +240,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, struct blk_report_zones_args *args); #else /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index a8db66428f80..56e455fb5add 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -35,8 +35,7 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64]) * @buf: SCSI zone descriptor. * @idx: Index of the zone relative to the first zone reported by the current * sd_zbc_report_zones() call. - * @cb: Callback function pointer. - * @data: Second argument passed to @cb. + * @args: report zones arguments (callback, etc) * * Return: Value returned by @cb. * @@ -44,12 +43,11 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64]) * call @cb(blk_zone, @data). 
*/ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], - unsigned int idx, report_zones_cb cb, void *data) + unsigned int idx, struct blk_report_zones_args *args) { struct scsi_device *sdp = sdkp->device; struct blk_zone zone = { 0 }; sector_t start_lba, gran; - int ret; if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf))) return -EINVAL; @@ -87,11 +85,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], else zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); - ret = cb(&zone, idx, data); - if (ret) - return ret; - - return 0; + return disk_report_zone(sdkp->disk, &zone, idx, args); } /** @@ -217,14 +211,14 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) * @disk: Disk to report zones for. * @sector: Start sector. * @nr_zones: Maximum number of zones to report. - * @cb: Callback function called to report zone information. - * @data: Second argument passed to @cb. + * @args: Callback arguments. * * Called by the block layer to iterate over zone information. See also the * disk->fops->report_zones() calls in block/blk-zoned.c. */ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct scsi_disk *sdkp = scsi_disk(disk); sector_t lba = sectors_to_logical(sdkp->device, sector); @@ -283,7 +277,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, } ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx, - cb, data); + args); if (ret) goto out; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99be263b31ab..2f75fb15f55f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; @@ -432,6 +433,9 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -1662,7 +1666,8 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 84fdc3a6a19a..38f625af6ab4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { struct dm_target *tgt; + struct gendisk *disk; sector_t next_sector; - void *orig_data; - report_zones_cb orig_cb; unsigned int zone_idx; + /* for block layer ->report_zones */ + struct blk_report_zones_args *rep_args; + + /* for internal users */ + report_zones_cb cb; + void *data; + /* must be filled by 
->report_zones before calling dm_report_zones_cb */ sector_t start; }; -- cgit v1.2.3 From 6e945ffb6555705cf20b1fcdc21a139911562995 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:40 +0900 Subject: block: use zone condition to determine conventional zones The conv_zones_bitmap field of struct gendisk is used to define a bitmap to identify the conventional zones of a zoned block device. The bit for a zone is set in this bitmap if the zone is a conventional one, that is, if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such zone, this always corresponds to the zone condition BLK_ZONE_COND_NOT_WP. In other words, conv_zones_bitmap tracks a single condition of the zones of a zoned block device. In preparation for tracking more zone conditions, change conv_zones_bitmap into an array of zone conditions, using 1 byte per zone. This increases the memory usage from 1 bit per zone to 1 byte per zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256 MiB zones. This is a trade-off to allow fast cached report zones later on top of this change. Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add a blk_revalidate_zone_cond() function to initialize the zones_cond array of a disk during device scan and to update it on device revalidation. Move the allocation of the zones_cond array to disk_revalidate_zone_resources(), making sure that this array is always allocated, even for devices that do not need zone write plugs (zone resources), to ensure that bdev_zone_is_seq() can be re-implemented to use the zone condition array in place of the conv zones bitmap. Finally, the function bdev_zone_is_seq() is rewritten to use a test on the condition of the target zone. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 153 ++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 37 +++--------- 2 files changed, 117 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d4fc87b0be6b..f62862274f9a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -114,6 +114,33 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); +/** + * bdev_zone_is_seq - check if a sector belongs to a sequential write zone + * @bdev: block device to check + * @sector: sector number + * + * Check if @sector on @bdev is contained in a sequential write required zone. + */ +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + struct gendisk *disk = bdev->bd_disk; + unsigned int zno = disk_zone_no(disk, sector); + bool is_seq = false; + u8 *zones_cond; + + if (!bdev_is_zoned(bdev)) + return false; + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (zones_cond && zno < disk->nr_zones) + is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; + rcu_read_unlock(); + + return is_seq; +} +EXPORT_SYMBOL_GPL(bdev_zone_is_seq); + /* * Zone report arguments for block device drivers report_zones operation. * @cb: report_zones_cb callback for each reported zone. 
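As a quick illustration of the new out-of-line bdev_zone_is_seq() helper added above, a minimal hypothetical caller could look like the sketch below. The function name and the bio-based context are assumptions made for illustration only; they are not part of this patch.

/*
 * Hypothetical caller sketch (not part of this series): check whether a
 * bio targets a sequential write required zone using the cached per-zone
 * condition, instead of issuing a report zones command to the device.
 */
static bool example_bio_targets_seq_zone(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev_is_zoned(bdev))
		return false;

	/* bdev_zone_is_seq() reads the RCU-protected zones_cond array. */
	return bdev_zone_is_seq(bdev, bio->bi_iter.bi_sector);
}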
@@ -1458,22 +1485,16 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) disk->zone_wplugs_hash_bits = 0; } -static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, - unsigned long *bitmap) +static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { - unsigned int nr_conv_zones = 0; unsigned long flags; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - if (bitmap) - nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); - bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, - lockdep_is_held(&disk->zone_wplugs_lock)); + zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - kfree_rcu_mightsleep(bitmap); - - return nr_conv_zones; + kfree_rcu_mightsleep(zones_cond); } void disk_free_zone_resources(struct gendisk *disk) @@ -1497,7 +1518,7 @@ void disk_free_zone_resources(struct gendisk *disk) mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; - disk_set_conv_zones_bitmap(disk, NULL); + disk_set_zones_cond_array(disk, NULL); disk->zone_capacity = 0; disk->last_zone_capacity = 0; disk->nr_zones = 0; @@ -1516,12 +1537,31 @@ static inline bool disk_need_zone_resources(struct gendisk *disk) queue_emulates_zone_append(disk->queue); } +struct blk_revalidate_zone_args { + struct gendisk *disk; + u8 *zones_cond; + unsigned int nr_zones; + unsigned int nr_conv_zones; + unsigned int zone_capacity; + unsigned int last_zone_capacity; + sector_t sector; +}; + static int disk_revalidate_zone_resources(struct gendisk *disk, - unsigned int nr_zones) + struct blk_revalidate_zone_args *args) { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + args->disk = disk; + args->nr_zones = + DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); + + /* Cached zone conditions: 1 byte per zone */ + args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); + if (!args->zones_cond) + return -ENOMEM; + if (!disk_need_zone_resources(disk)) return 0; @@ -1531,7 +1571,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, */ pool_size = max(lim->max_open_zones, lim->max_active_zones); if (!pool_size) - pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + pool_size = + min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); if (!disk->zone_wplugs_hash) return disk_alloc_zone_resources(disk, pool_size); @@ -1539,15 +1580,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, return 0; } -struct blk_revalidate_zone_args { - struct gendisk *disk; - unsigned long *conv_zones_bitmap; - unsigned int nr_zones; - unsigned int zone_capacity; - unsigned int last_zone_capacity; - sector_t sector; -}; - /* * Update the disk zone resources information and device queue limits. * The disk queue is frozen when this is executed. 
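To put the footprint trade-off described in the changelog above in concrete terms, here is a rough estimate assuming a decimal 30 TB capacity and 256 MiB zones (the exact figures depend on how the capacity is rounded):

    nr_zones   ~= 30 * 10^12 B / 256 MiB  ~= 111,800 zones
    old bitmap ~= 111,800 bits            ~= 14 KiB   (conv_zones_bitmap, 1 bit per zone)
    new array  ~= 111,800 bytes           ~= 110 KiB  (zones_cond, 1 byte per zone)

This is in line with the "16 KiB to about 100 KiB" figures quoted in the commit message, and corresponds to the kzalloc(args->nr_zones, GFP_NOIO) allocation done in disk_revalidate_zone_resources() above.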
@@ -1556,7 +1588,7 @@ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; - unsigned int nr_seq_zones, nr_conv_zones; + unsigned int nr_seq_zones; unsigned int pool_size, memflags; struct queue_limits lim; int ret = 0; @@ -1566,24 +1598,24 @@ static int disk_update_zone_resources(struct gendisk *disk, memflags = blk_mq_freeze_queue(q); disk->nr_zones = args->nr_zones; - disk->zone_capacity = args->zone_capacity; - disk->last_zone_capacity = args->last_zone_capacity; - nr_conv_zones = - disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); - if (nr_conv_zones >= disk->nr_zones) { + if (args->nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", - disk->disk_name, nr_conv_zones, disk->nr_zones); + disk->disk_name, args->nr_conv_zones, disk->nr_zones); ret = -ENODEV; goto unfreeze; } + disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; + disk_set_zones_cond_array(disk, args->zones_cond); + /* - * Some devices can advertize zone resource limits that are larger than + * Some devices can advertise zone resource limits that are larger than * the number of sequential zones of the zoned block device, e.g. a * small ZNS namespace. For such case, assume that the zoned device has * no zone resource limits. */ - nr_seq_zones = disk->nr_zones - nr_conv_zones; + nr_seq_zones = disk->nr_zones - args->nr_conv_zones; if (lim.max_open_zones >= nr_seq_zones) lim.max_open_zones = 0; if (lim.max_active_zones >= nr_seq_zones) @@ -1624,6 +1656,44 @@ unfreeze: return ret; } +static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + enum blk_zone_cond cond = zone->cond; + + /* Check that the zone condition is consistent with the zone type. 
*/ + switch (cond) { + case BLK_ZONE_COND_NOT_WP: + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) + goto invalid_condition; + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + goto invalid_condition; + break; + default: + pr_warn("%s: Invalid zone condition 0x%X\n", + args->disk->disk_name, cond); + return -ENODEV; + } + + args->zones_cond[idx] = cond; + + return 0; + +invalid_condition: + pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", + args->disk->disk_name, cond, zone->type); + + return -ENODEV; +} + static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { @@ -1638,17 +1708,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, if (disk_zone_is_last(disk, zone)) args->last_zone_capacity = zone->capacity; - if (!disk_need_zone_resources(disk)) - return 0; - - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - bitmap_zalloc(args->nr_zones, GFP_NOIO); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - - set_bit(idx, args->conv_zones_bitmap); + args->nr_conv_zones++; return 0; } @@ -1746,6 +1806,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + /* Check zone condition */ + ret = blk_revalidate_zone_cond(zone, idx, args); + if (ret) + return ret; + /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: @@ -1813,10 +1878,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ - args.disk = disk; - args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); - ret = disk_revalidate_zone_resources(disk, args.nr_zones); + ret = disk_revalidate_zone_resources(disk, &args); if (ret) { memalloc_noio_restore(noio_flag); return ret; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f75fb15f55f..53bcfbc2f68f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long __rcu *conv_zones_bitmap; + u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; @@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev, { return disk_zone_capacity(bdev->bd_disk, pos); } + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + static inline bool bio_needs_zone_write_plugging(struct bio *bio) { return false; @@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev, return bdev_is_zone_start(bdev, sector); } -/** - * bdev_zone_is_seq - check if a sector belongs to a sequential write zone - * @bdev: block device to check - * @sector: sector number - * - * Check if @sector on @bdev is contained in a sequential write required zone. 
- */ -static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) -{ - bool is_seq = false; - -#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) - if (bdev_is_zoned(bdev)) { - struct gendisk *disk = bdev->bd_disk; - unsigned long *bitmap; - - rcu_read_lock(); - bitmap = rcu_dereference(disk->conv_zones_bitmap); - is_seq = !bitmap || - !test_bit(disk_zone_no(disk, sector), bitmap); - rcu_read_unlock(); - } -#endif - - return is_seq; -} - int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); -- cgit v1.2.3 From f2284eec5053df271c78e687672247922bcee881 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:43 +0900 Subject: block: introduce blkdev_get_zone_info() Introduce the function blkdev_get_zone_info() to obtain the information for a single zone from cached zone data, that is, either from the zone write plug for the target zone, if it exists, or from the disk zones_cond array otherwise. Since sequential zones that do not have a zone write plug are either full, empty or in a bad state (read-only or offline), the zone write pointer can be inferred from the zone condition cached in the disk zones_cond array. For sequential zones that have a zone write plug, the zone condition and zone write pointer are obtained from the condition and write pointer offset managed with the zone write plug. This allows obtaining the information for a zone much more quickly than having to execute a report zones command on the device. blkdev_get_zone_info() falls back to using a regular zone report if the target zone is flagged as needing an update with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not use zone write plugs (i.e. a device mapper device). In this case, the new function blkdev_report_zone_fallback() is used and the zone condition is reported consistently with the cached report, that is, the BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open, explicit open and closed conditions. This is achieved by adding the .report_active field to struct blk_report_zones_args and by having disk_report_zone() set the correct zone condition if .report_active is true. In preparation for using blkdev_get_zone_info() in upcoming file system changes, also export this function as a GPL symbol. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 3 ++ 2 files changed, 144 insertions(+) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 9ce7570c91e1..d98babfe49df 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -203,6 +203,7 @@ EXPORT_SYMBOL_GPL(bdev_zone_is_seq); struct blk_report_zones_args { report_zones_cb cb; void *data; + bool report_active; }; static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector, @@ -820,6 +821,23 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args) { + if (args->report_active) { + /* + * If we come here, then this is a report zones as a fallback + * for a cached report. So collapse the implicit open, explicit + * open and closed conditions into the active zone condition.
+ */ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + zone->cond = BLK_ZONE_COND_ACTIVE; + break; + default: + break; + } + } + if (disk->zone_wplugs_hash) disk_zone_wplug_sync_wp_offset(disk, zone); @@ -830,6 +848,129 @@ int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, } EXPORT_SYMBOL_GPL(disk_report_zone); +static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int blkdev_report_zone_fallback(struct block_device *bdev, + sector_t sector, struct blk_zone *zone) +{ + struct blk_report_zones_args args = { + .cb = blkdev_report_zone_cb, + .data = zone, + .report_active = true, + }; + + return blkdev_do_report_zones(bdev, sector, 1, &args); +} + +/** + * blkdev_get_zone_info - Get a single zone information from cached data + * @bdev: Target block device + * @sector: Sector contained by the target zone + * @zone: zone structure to return the zone information + * + * Description: + * Get the zone information for the zone containing @sector using the zone + * write plug of the target zone, if one exist, or the disk zone condition + * array otherwise. The zone condition may be reported as being + * the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit + * open, explicit open or closed condition. + * + * Returns 0 on success and a negative error code on failure. + */ +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t zone_sectors = bdev_zone_sectors(bdev); + struct blk_zone_wplug *zwplug; + unsigned long flags; + u8 *zones_cond; + + if (!bdev_is_zoned(bdev)) + return -EOPNOTSUPP; + + if (sector >= get_capacity(disk)) + return -EINVAL; + + memset(zone, 0, sizeof(*zone)); + sector = ALIGN_DOWN(sector, zone_sectors); + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (!disk->zone_wplugs_hash || !zones_cond) { + rcu_read_unlock(); + return blkdev_report_zone_fallback(bdev, sector, zone); + } + zone->cond = zones_cond[disk_zone_no(disk, sector)]; + rcu_read_unlock(); + + zone->start = sector; + zone->len = zone_sectors; + + /* + * If this is a conventional zone, we do not have a zone write plug and + * can report the zone immediately. + */ + if (zone->cond == BLK_ZONE_COND_NOT_WP) { + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->capacity = zone_sectors; + zone->wp = ULLONG_MAX; + return 0; + } + + /* + * This is a sequential write required zone. If the zone is read-only or + * offline, only set the zone write pointer to an invalid value and + * report the zone. + */ + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + if (disk_zone_is_last(disk, zone)) + zone->capacity = disk->last_zone_capacity; + else + zone->capacity = disk->zone_capacity; + + if (zone->cond == BLK_ZONE_COND_READONLY || + zone->cond == BLK_ZONE_COND_OFFLINE) { + zone->wp = ULLONG_MAX; + return 0; + } + + /* + * If the zone does not have a zone write plug, it is either full or + * empty, as we otherwise would have a zone write plug for it. In this + * case, set the write pointer accordingly and report the zone. + * Otherwise, if we have a zone write plug, use it. 
+ */ + zwplug = disk_get_zone_wplug(disk, sector); + if (!zwplug) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = ULLONG_MAX; + else + zone->wp = sector; + return 0; + } + + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) { + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + return blkdev_report_zone_fallback(bdev, sector, zone); + } + zone->cond = zwplug->cond; + zone->wp = sector + zwplug->wp_offset; + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_get_zone_info); + static void blk_zone_reset_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53bcfbc2f68f..03a594b4dfbc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -436,6 +436,9 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args); +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -- cgit v1.2.3 From 31f0656a4ab712edf2888eabcc0664197a4a938e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:44 +0900 Subject: block: introduce blkdev_report_zones_cached() Introduce the function blkdev_report_zones_cached() to provide a fast zone report built using the blkdev_get_zone_info() function, which gets zone information from a disk zones_cond array or zone write plugs. For a large capacity SMR drive, such a fast zone report can complete in a few milliseconds, compared to completion times of several seconds when the zone report is obtained from the device. The zone report is built in the same manner as with the regular blkdev_report_zones() function, that is, the first zone reported is the one containing the specified start sector and the report is limited to the specified number of zones (nr_zones argument). The information for each zone in the report is obtained using blkdev_get_zone_info(). For zoned devices that do not use zone write plug resources, using blkdev_get_zone_info() is inefficient as the zone report would be very slow, generated one zone at a time. To avoid this, blkdev_report_zones_cached() falls back to calling blkdev_do_report_zones() to execute a regular zone report. In this case, the .report_active field of struct blk_report_zones_args is set to true to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in place of the implicit open, explicit open and closed conditions. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 88 ++++++++++++++++++++++++++++++++++++++++++-------- include/linux/blkdev.h | 2 ++ 2 files changed, 77 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d98babfe49df..bbd105b11843 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -74,6 +74,19 @@ struct blk_zone_wplug { enum blk_zone_cond cond; }; +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All request-based zoned devices need zone resources so that the + * block layer can automatically handle write BIO plugging. BIO-based + * device drivers (e.g. DM devices) are normally responsible for + * handling zone write ordering and do not need zone resources, unless + * the driver requires zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) { return 1U << disk->zone_wplugs_hash_bits; @@ -971,6 +984,68 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_get_zone_info); +/** + * blkdev_report_zones_cached - Get cached zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback function + * + * Description: + * Similar to blkdev_report_zones() but instead of calling into the low level + * device driver to get the zone report from the device, use + * blkdev_get_zone_info() to generate the report from the disk zone write + * plugs and zones condition array. Since calling this function without a + * callback does not make sense, @cb must be specified. + */ +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); + sector_t zone_sectors = bdev_zone_sectors(bdev); + unsigned int idx = 0; + struct blk_zone zone; + int ret; + + if (!cb || !bdev_is_zoned(bdev) || + WARN_ON_ONCE(!disk->fops->report_zones)) + return -EOPNOTSUPP; + + if (!nr_zones || sector >= capacity) + return 0; + + /* + * If we do not have any zone write plug resources, fallback to using + * the regular zone report. + */ + if (!disk_need_zone_resources(disk)) { + struct blk_report_zones_args args = { + .cb = cb, + .data = data, + .report_active = true, + }; + + return blkdev_do_report_zones(bdev, sector, nr_zones, &args); + } + + for (sector = ALIGN_DOWN(sector, zone_sectors); + sector < capacity && idx < nr_zones; + sector += zone_sectors, idx++) { + ret = blkdev_get_zone_info(bdev, sector, &zone); + if (ret) + return ret; + + ret = cb(&zone, idx, data); + if (ret) + return ret; + } + + return idx; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones_cached); + static void blk_zone_reset_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -1781,19 +1856,6 @@ void disk_free_zone_resources(struct gendisk *disk) disk->nr_zones = 0; } -static inline bool disk_need_zone_resources(struct gendisk *disk) -{ - /* - * All mq zoned devices need zone resources so that the block layer - * can automatically handle write BIO plugging. BIO-based device drivers - * (e.g. 
DM devices) are normally responsible for handling zone write - * ordering and do not need zone resources, unless the driver requires - * zone append emulation. - */ - return queue_is_mq(disk->queue) || - queue_emulates_zone_append(disk->queue); -} - struct blk_revalidate_zone_args { struct gendisk *disk; u8 *zones_cond; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 03a594b4dfbc..f0ab02e0a673 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,6 +442,8 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); -- cgit v1.2.3 From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:24 -0700 Subject: KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in anticipation of using the API on x86. Once x86 uses the API, providing a stub for one architecture and having all other architectures opt-in requires more code than simply implementing the API in the lone holdout. Eliminating the Kconfig will also reduce churn if the API is renamed in the future (spoiler alert). No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 ++++++ arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 10 ---------- virt/kvm/Kconfig | 3 --- 9 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..ef5bf57f79b7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git 
a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..ca5ba2caf314 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,6 +7240,12 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..7186b2ae4b57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2437,18 +2437,8 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool -- cgit v1.2.3 From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:25 -0700 Subject: KVM: Rename kvm_arch_vcpu_async_ioctl() to kvm_arch_vcpu_unlocked_ioctl() Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's TDX code doesn't result in a massive misnomer. To avoid having to retry SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and acquiring all of those locks after/inside the current vCPU's mutex is a non-starter. However, TDX also needs to acquire the vCPU's mutex and load the vCPU, i.e. the handling is very much not async to the vCPU. No functional change intended. 
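The per-architecture handlers touched by this rename (see the hunks below) share a common shape: the vCPU is recovered from filp->private_data, the few ioctls that are safe to handle without vcpu->mutex are dispatched directly, and everything else returns -ENOIOCTLCMD so that kvm_vcpu_ioctl() retries the request with the mutex held. A minimal sketch of that pattern, with a hypothetical ioctl number and helper standing in for an architecture's real cases:

long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
				  unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;

	switch (ioctl) {
	case KVM_EXAMPLE_UNLOCKED_IOCTL:	/* hypothetical ioctl number */
		/* Handled without taking vcpu->mutex. */
		return kvm_example_handle_unlocked(vcpu, argp);	/* hypothetical helper */
	default:
		/*
		 * Everything else falls back to kvm_arch_vcpu_ioctl(), which
		 * the generic code invokes with vcpu->mutex held.
		 */
		return -ENOIOCTLCMD;
	}
}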
Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 4 ++-- arch/loongarch/kvm/vcpu.c | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/riscv/kvm/vcpu.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 9 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ef5bf57f79b7..cf23f6b07ec7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,8 +1835,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..9a5844e85fd3 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1471,8 +1471,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..a4bd6077eecc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long 
kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca5ba2caf314..b85cb213a336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,8 +7240,8 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7186b2ae4b57..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..b7db1d5f71a8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. */ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; -- cgit v1.2.3 From e0b62a4dee24e9176f2c4be52a1b47fe1d97c560 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:33 +0100 Subject: fs: add fs/super_types.h header Split out super block associated structures into a separate header. 
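As a rough sketch of what the split enables (the helper below is hypothetical, and the include path is taken from the diffstat that follows), code that only needs the super block type definitions could in principle include the dedicated header rather than all of linux/fs.h:

#include <linux/fs/super_types.h>

/*
 * Hypothetical helper: it only needs struct super_block and the SB_* flag
 * definitions, both of which live in the split-out header.
 */
static inline bool example_sb_is_synchronous(const struct super_block *sb)
{
	return sb->s_flags & SB_SYNCHRONOUS;
}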
Link: https://patch.msgid.link/20251104-work-fs-header-v1-2-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 308 +------------------------------------ include/linux/fs/super_types.h | 335 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 336 insertions(+), 307 deletions(-) create mode 100644 include/linux/fs/super_types.h (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3c971ddace41..ae71c359077a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include #include #include #include @@ -11,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -52,11 +51,9 @@ #include #include -struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; -struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; @@ -70,12 +67,8 @@ struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; -struct workqueue_struct; struct iov_iter; -struct fscrypt_operations; -struct fsverity_operations; struct fsnotify_mark_connector; -struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct file_kattr; @@ -298,11 +291,6 @@ struct iattr { struct file *ia_file; }; -/* - * Includes for diskquotas. - */ -#include - /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow @@ -1347,49 +1335,6 @@ extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); -/* - * sb->s_flags. Note that these mirror the equivalent MS_* flags where - * represented in both. - */ -#define SB_RDONLY BIT(0) /* Mount read-only */ -#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ -#define SB_NODEV BIT(2) /* Disallow access to device special files */ -#define SB_NOEXEC BIT(3) /* Disallow program execution */ -#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ -#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ -#define SB_NOATIME BIT(10) /* Do not update access times. 
*/ -#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ -#define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ -#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ -#define SB_I_VERSION BIT(23) /* Update inode I_version field */ -#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define SB_DEAD BIT(21) -#define SB_DYING BIT(24) -#define SB_FORCE BIT(27) -#define SB_NOSEC BIT(28) -#define SB_BORN BIT(29) -#define SB_ACTIVE BIT(30) -#define SB_NOUSER BIT(31) - -/* These flags relate to encoding and casefolding */ -#define SB_ENC_STRICT_MODE_FL (1 << 0) -#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) - -#define sb_has_strict_encoding(sb) \ - (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) - -#if IS_ENABLED(CONFIG_UNICODE) -#define sb_no_casefold_compat_fallback(sb) \ - (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) -#else -#define sb_no_casefold_compat_fallback(sb) (1) -#endif - /* * Umount options */ @@ -1400,191 +1345,6 @@ extern int send_sigurg(struct file *file); #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -/* sb->s_iflags */ -#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ -#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ -#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ - -/* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 -#define SB_I_UNTRUSTED_MOUNTER 0x00000040 -#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 - -#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ -#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ -#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ -#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ -#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ -#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ -#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ - -/* Possible states of 'frozen' field */ -enum { - SB_UNFROZEN = 0, /* FS is unfrozen */ - SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ - SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ - SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop - * internal threads if needed) */ - SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ -}; - -#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) - -struct sb_writers { - unsigned short frozen; /* Is sb frozen? */ - int freeze_kcount; /* How many kernel freeze requests? */ - int freeze_ucount; /* How many userspace freeze requests? 
*/ - const void *freeze_owner; /* Owner of the freeze */ - struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; -}; - -struct mount; - -struct super_block { - struct list_head s_list; /* Keep this first */ - dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_blocksize_bits; - unsigned long s_blocksize; - loff_t s_maxbytes; /* Max file size */ - struct file_system_type *s_type; - const struct super_operations *s_op; - const struct dquot_operations *dq_op; - const struct quotactl_ops *s_qcop; - const struct export_operations *s_export_op; - unsigned long s_flags; - unsigned long s_iflags; /* internal SB_I_* flags */ - unsigned long s_magic; - struct dentry *s_root; - struct rw_semaphore s_umount; - int s_count; - atomic_t s_active; -#ifdef CONFIG_SECURITY - void *s_security; -#endif - const struct xattr_handler * const *s_xattr; -#ifdef CONFIG_FS_ENCRYPTION - const struct fscrypt_operations *s_cop; - struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ -#endif -#ifdef CONFIG_FS_VERITY - const struct fsverity_operations *s_vop; -#endif -#if IS_ENABLED(CONFIG_UNICODE) - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct mount *s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ - struct file *s_bdev_file; - struct backing_dev_info *s_bdi; - struct mtd_info *s_mtd; - struct hlist_node s_instances; - unsigned int s_quota_types; /* Bitmask of supported quota types */ - struct quota_info s_dquot; /* Diskquota specific options */ - - struct sb_writers s_writers; - - /* - * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_info together for cache efficiency. They are frequently - * accessed and rarely modified. - */ - void *s_fs_info; /* Filesystem private info */ - - /* Granularity of c/m/atime in ns (cannot be worse than a second) */ - u32 s_time_gran; - /* Time limits for c/m/atime in seconds */ - time64_t s_time_min; - time64_t s_time_max; -#ifdef CONFIG_FSNOTIFY - u32 s_fsnotify_mask; - struct fsnotify_sb_info *s_fsnotify_info; -#endif - - /* - * q: why are s_id and s_sysfs_name not the same? both are human - * readable strings that identify the filesystem - * a: s_id is allowed to change at runtime; it's used in log messages, - * and we want to when a device starts out as single device (s_id is dev - * name) but then a device is hot added and we have to switch to - * identifying it by UUID - * but s_sysfs_name is a handle for programmatic access, and can't - * change at runtime - */ - char s_id[32]; /* Informational name */ - uuid_t s_uuid; /* UUID */ - u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ - - /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ - char s_sysfs_name[UUID_STRING_LEN + 1]; - - unsigned int s_max_links; - unsigned int s_d_flags; /* default d_flags for dentries */ - - /* - * The next field is for VFS *only*. No filesystems have any business - * even looking at it. You had been warned. - */ - struct mutex s_vfs_rename_mutex; /* Kludge */ - - /* - * Filesystem subtype. 
If non-empty the filesystem type field - * in /proc/mounts will be "type.subtype" - */ - const char *s_subtype; - - const struct dentry_operations *__s_d_op; /* default d_op for dentries */ - - struct shrinker *s_shrink; /* per-sb shrinker handle */ - - /* Number of inodes with nlink == 0 but still referenced */ - atomic_long_t s_remove_count; - - /* Read-only state of the superblock is being changed */ - int s_readonly_remount; - - /* per-sb errseq_t for reporting writeback errors via syncfs */ - errseq_t s_wb_err; - - /* AIO completions deferred from interrupt context */ - struct workqueue_struct *s_dio_done_wq; - struct hlist_head s_pins; - - /* - * Owning user namespace and default context in which to - * interpret filesystem uids, gids, quotas, device nodes, - * xattrs and security labels. - */ - struct user_namespace *s_user_ns; - - /* - * The list_lru structure is essentially just a pointer to a table - * of per-node lru lists, each of which has its own spinlock. - * There is no need to put them into separate cachelines. - */ - struct list_lru s_dentry_lru; - struct list_lru s_inode_lru; - struct rcu_head rcu; - struct work_struct destroy_work; - - struct mutex s_sync_lock; /* sync serialisation lock */ - - /* - * Indicates how deep in a filesystem stack this SB is - */ - int s_stack_depth; - - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ - - spinlock_t s_inode_wblist_lock; - struct list_head s_inodes_wb; /* writeback inodes */ -} __randomize_layout; - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; @@ -2431,72 +2191,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); -/** - * enum freeze_holder - holder of the freeze - * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem - * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem - * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed - * @FREEZE_EXCL: a freeze that can only be undone by the owner - * - * Indicate who the owner of the freeze or thaw request is and whether - * the freeze needs to be exclusive or can nest. - * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the - * same holder aren't allowed. It is however allowed to hold a single - * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at - * the same time. This is relied upon by some filesystems during online - * repair or similar. 
- */ -enum freeze_holder { - FREEZE_HOLDER_KERNEL = (1U << 0), - FREEZE_HOLDER_USERSPACE = (1U << 1), - FREEZE_MAY_NEST = (1U << 2), - FREEZE_EXCL = (1U << 3), -}; - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - void (*free_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, struct writeback_control *wbc); - int (*drop_inode) (struct inode *); - void (*evict_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - int (*show_devname)(struct seq_file *, struct dentry *); - int (*show_path)(struct seq_file *, struct dentry *); - int (*show_stats)(struct seq_file *, struct dentry *); -#ifdef CONFIG_QUOTA - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot __rcu **(*get_dquots)(struct inode *); -#endif - long (*nr_cached_objects)(struct super_block *, - struct shrink_control *); - long (*free_cached_objects)(struct super_block *, - struct shrink_control *); - /* - * If a filesystem can support graceful removal of a device and - * continue read-write operations, implement this callback. - * - * Return 0 if the filesystem can continue read-write. - * Non-zero return value or no such callback means the fs will be shutdown - * as usual. - */ - int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); - void (*shutdown)(struct super_block *sb); -}; - /* * Inode flags - they have no relation to superblock flags now */ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h new file mode 100644 index 000000000000..45cfd45b9fe0 --- /dev/null +++ b/include/linux/fs/super_types.h @@ -0,0 +1,335 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_TYPES_H +#define _LINUX_FS_SUPER_TYPES_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct backing_dev_info; +struct block_device; +struct dentry; +struct dentry_operations; +struct dquot_operations; +struct export_operations; +struct file; +struct file_system_type; +struct fscrypt_operations; +struct fsnotify_sb_info; +struct fsverity_operations; +struct kstatfs; +struct mount; +struct mtd_info; +struct quotactl_ops; +struct shrinker; +struct unicode_map; +struct user_namespace; +struct workqueue_struct; +struct writeback_control; +struct xattr_handler; + +extern struct super_block *blockdev_superblock; + +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. 
to stop internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + unsigned short frozen; /* Is sb frozen? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ + const void *freeze_owner; /* Owner of the freeze */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; +}; + +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * @FREEZE_EXCL: a freeze that can only be undone by the owner + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), + FREEZE_EXCL = (1U << 3), +}; + +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *inode); + void (*free_inode)(struct inode *inode); + void (*dirty_inode)(struct inode *inode, int flags); + int (*write_inode)(struct inode *inode, struct writeback_control *wbc); + int (*drop_inode)(struct inode *inode); + void (*evict_inode)(struct inode *inode); + void (*put_super)(struct super_block *sb); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*freeze_fs)(struct super_block *sb); + int (*thaw_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*unfreeze_fs)(struct super_block *sb); + int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs); + int (*remount_fs) (struct super_block *, int *, char *); + void (*umount_begin)(struct super_block *sb); + + int (*show_options)(struct seq_file *seq, struct dentry *dentry); + int (*show_devname)(struct seq_file *seq, struct dentry *dentry); + int (*show_path)(struct seq_file *seq, struct dentry *dentry); + int (*show_stats)(struct seq_file *seq, struct dentry *dentry); +#ifdef CONFIG_QUOTA + ssize_t (*quota_read)(struct super_block *sb, int type, char *data, + size_t len, loff_t off); + ssize_t (*quota_write)(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + struct dquot __rcu **(*get_dquots)(struct inode *inode); +#endif + long (*nr_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + long (*free_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + /* + * If a filesystem can support graceful removal of a device and + * continue read-write operations, implement this callback. + * + * Return 0 if the filesystem can continue read-write. + * Non-zero return value or no such callback means the fs will be shutdown + * as usual. 
+ */ + int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); + void (*shutdown)(struct super_block *sb); +}; + +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned char s_blocksize_bits; + unsigned long s_blocksize; + loff_t s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + const struct super_operations *s_op; + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + int s_count; + atomic_t s_active; +#ifdef CONFIG_SECURITY + void *s_security; +#endif + const struct xattr_handler *const *s_xattr; +#ifdef CONFIG_FS_ENCRYPTION + const struct fscrypt_operations *s_cop; + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ +#endif +#ifdef CONFIG_FS_VERITY + const struct fsverity_operations *s_vop; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; + struct backing_dev_info *s_bdi; + struct mtd_info *s_mtd; + struct hlist_node s_instances; + unsigned int s_quota_types; /* Bitmask of supported quota types */ + struct quota_info s_dquot; /* Diskquota specific options */ + + struct sb_writers s_writers; + + /* + * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and + * s_fsnotify_info together for cache efficiency. They are frequently + * accessed and rarely modified. + */ + void *s_fs_info; /* Filesystem private info */ + + /* Granularity of c/m/atime in ns (cannot be worse than a second) */ + u32 s_time_gran; + /* Time limits for c/m/atime in seconds */ + time64_t s_time_min; + time64_t s_time_max; +#ifdef CONFIG_FSNOTIFY + u32 s_fsnotify_mask; + struct fsnotify_sb_info *s_fsnotify_info; +#endif + + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ + char s_id[32]; /* Informational name */ + uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + + unsigned int s_max_links; + unsigned int s_d_flags; /* default d_flags for dentries */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct mutex s_vfs_rename_mutex; /* Kludge */ + + /* + * Filesystem subtype. 
If non-empty the filesystem type field + * in /proc/mounts will be "type.subtype" + */ + const char *s_subtype; + + const struct dentry_operations *__s_d_op; /* default d_op for dentries */ + + struct shrinker *s_shrink; /* per-sb shrinker handle */ + + /* Number of inodes with nlink == 0 but still referenced */ + atomic_long_t s_remove_count; + + /* Read-only state of the superblock is being changed */ + int s_readonly_remount; + + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + + /* AIO completions deferred from interrupt context */ + struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; + + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + + /* + * The list_lru structure is essentially just a pointer to a table + * of per-node lru lists, each of which has its own spinlock. + * There is no need to put them into separate cachelines. + */ + struct list_lru s_dentry_lru; + struct list_lru s_inode_lru; + struct rcu_head rcu; + struct work_struct destroy_work; + + struct mutex s_sync_lock; /* sync serialisation lock */ + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ +} __randomize_layout; + +/* + * sb->s_flags. Note that these mirror the equivalent MS_* flags where + * represented in both. + */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. 
*/ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) +#define SB_DYING BIT(24) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) + +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + +#if IS_ENABLED(CONFIG_UNICODE) +#define sb_no_casefold_compat_fallback(sb) \ + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) +#else +#define sb_no_casefold_compat_fallback(sb) (1) +#endif + +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ + +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 +#define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 + +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ + +#endif /* _LINUX_FS_SUPER_TYPES_H */ -- cgit v1.2.3 From f7b3d14165222a3ad9c4d0d31dfa81e396751801 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:34 +0100 Subject: fs: add fs/super.h header Split out super block associated functions into a separate header. Link: https://patch.msgid.link/20251104-work-fs-header-v1-3-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 220 +------------------------------------------- include/linux/fs/super.h | 233 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 219 deletions(-) create mode 100644 include/linux/fs/super.h (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae71c359077a..64af28318fbf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,7 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H -#include +#include #include #include #include @@ -1662,66 +1662,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode); * Snapshotting support. */ -/* - * These are internal functions, please use sb_start_{write,pagefault,intwrite} - * instead. 
- */ -static inline void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} - -static inline void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); -} - -static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} - -#define __sb_writers_acquired(sb, lev) \ - percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -#define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) - -/** - * __sb_write_started - check if sb freeze level is held - * @sb: the super we write to - * @level: the freeze level - * - * * > 0 - sb freeze level is held - * * 0 - sb freeze level is not held - * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN - */ -static inline int __sb_write_started(const struct super_block *sb, int level) -{ - return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); -} - -/** - * sb_write_started - check if SB_FREEZE_WRITE is held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE); -} - -/** - * sb_write_not_started - check if SB_FREEZE_WRITE is not held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_not_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; -} - /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1752,118 +1692,6 @@ static inline bool file_write_not_started(const struct file *file) return sb_write_not_started(file_inode(file)->i_sb); } -/** - * sb_end_write - drop write access to a superblock - * @sb: the super we wrote to - * - * Decrement number of writers to the filesystem. Wake up possible waiters - * wanting to freeze the filesystem. - */ -static inline void sb_end_write(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_WRITE); -} - -/** - * sb_end_pagefault - drop write access to a superblock from a page fault - * @sb: the super we wrote to - * - * Decrement number of processes handling write page fault to the filesystem. - * Wake up possible waiters wanting to freeze the filesystem. - */ -static inline void sb_end_pagefault(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_end_intwrite - drop write access to a superblock for internal fs purposes - * @sb: the super we wrote to - * - * Decrement fs-internal number of writers to the filesystem. Wake up possible - * waiters wanting to freeze the filesystem. - */ -static inline void sb_end_intwrite(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_FS); -} - -/** - * sb_start_write - get write access to a superblock - * @sb: the super we write to - * - * When a process wants to write data or metadata to a file system (i.e. dirty - * a page or an inode), it should embed the operation in a sb_start_write() - - * sb_end_write() pair to get exclusion against file system freezing. This - * function increments number of writers preventing freezing. If the file - * system is already frozen, the function waits until the file system is - * thawed. 
- * - * Since freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. Generally, - * freeze protection should be the outermost lock. In particular, we have: - * - * sb_start_write - * -> i_rwsem (write path, truncate, directory ops, ...) - * -> s_umount (freeze_super, thaw_super) - */ -static inline void sb_start_write(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_WRITE); -} - -static inline bool sb_start_write_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -} - -/** - * sb_start_pagefault - get write access to a superblock from a page fault - * @sb: the super we write to - * - * When a process starts handling write page fault, it should embed the - * operation into sb_start_pagefault() - sb_end_pagefault() pair to get - * exclusion against file system freezing. This is needed since the page fault - * is going to dirty a page. This function increments number of running page - * faults preventing freezing. If the file system is already frozen, the - * function waits until the file system is thawed. - * - * Since page fault freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. It is advised to - * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault - * handling code implies lock dependency: - * - * mmap_lock - * -> sb_start_pagefault - */ -static inline void sb_start_pagefault(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_start_intwrite - get write access to a superblock for internal fs purposes - * @sb: the super we write to - * - * This is the third level of protection against filesystem freezing. It is - * free for use by a filesystem. The only requirement is that it must rank - * below sb_start_pagefault. - * - * For example filesystem can call sb_start_intwrite() when starting a - * transaction which somewhat eases handling of freezing for internal sources - * of filesystem changes (internal fs threads, discarding preallocation on file - * close, etc.). 
- */ -static inline void sb_start_intwrite(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_FS); -} - -static inline bool sb_start_intwrite_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_FS); -} - bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); @@ -2233,7 +2061,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) @@ -2467,10 +2294,6 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -int freeze_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); -int thaw_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); @@ -2657,12 +2480,6 @@ extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) -extern struct super_block *blockdev_superblock; -static inline bool sb_is_blkdev_sb(struct super_block *sb) -{ - return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; -} - void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; @@ -3117,9 +2934,6 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); - int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); @@ -3439,38 +3253,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, } #endif -static inline struct unicode_map *sb_encoding(const struct super_block *sb) -{ -#if IS_ENABLED(CONFIG_UNICODE) - return sb->s_encoding; -#else - return NULL; -#endif -} - -static inline bool sb_has_encoding(const struct super_block *sb) -{ - return !!sb_encoding(sb); -} - -/* - * Compare if two super blocks have the same encoding and flags - */ -static inline bool sb_same_encoding(const struct super_block *sb1, - const struct super_block *sb2) -{ -#if IS_ENABLED(CONFIG_UNICODE) - if (sb1->s_encoding == sb2->s_encoding) - return true; - - return (sb1->s_encoding && sb2->s_encoding && - (sb1->s_encoding->version == sb2->s_encoding->version) && - (sb1->s_encoding_flags == sb2->s_encoding_flags)); -#else - return true; -#endif -} - int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h new file mode 100644 index 000000000000..c0d22b12c1c9 --- /dev/null +++ b/include/linux/fs/super.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
_LINUX_FS_SUPER_H +#define _LINUX_FS_SUPER_H + +#include +#include + +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level - 1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} + +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) + +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. 
Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_rwsem (write path, truncate, directory ops, ...) + * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE); +} + +static inline bool sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_lock + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). 
+ */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS); +} + +static inline bool sb_start_intwrite_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_FS); +} + +static inline bool sb_rdonly(const struct super_block *sb) +{ + return sb->s_flags & SB_RDONLY; +} + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; +} + +#if IS_ENABLED(CONFIG_UNICODE) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return sb->s_encoding; +} + +/* Compare if two super blocks have the same encoding and flags */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +} +#else +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return NULL; +} + +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + return true; +} +#endif + +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + +int sb_set_blocksize(struct super_block *sb, int size); +int sb_min_blocksize(struct super_block *sb, int size); + +int freeze_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); +int thaw_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); + +#endif /* _LINUX_FS_SUPER_H */ -- cgit v1.2.3 From 5b8ed52866e3d19e02860c7cf1d6bbbd70b619e9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Nov 2025 18:04:48 +0100 Subject: fs: inline current_umask() and move it to fs_struct.h There is no good reason to have this as a func call, other than avoiding the churn of adding fs_struct.h as needed. 
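As an illustrative sketch (not taken from this patch), a typical caller now pulls in fs_struct.h and uses the inline helper directly, for example when deciding the mode of a new inode:

    /* Illustrative only; example_new_inode_mode() is a made-up helper showing
     * the usual pattern of applying the process umask when no default POSIX
     * ACL dictates the mode. Callers now need <linux/fs_struct.h>.
     */
    #include <linux/fs.h>
    #include <linux/fs_struct.h>

    static umode_t example_new_inode_mode(const struct inode *dir, umode_t mode)
    {
            /* Strip the caller's umask bits unless ACL defaults apply. */
            if (!IS_POSIXACL(dir))
                    mode &= ~current_umask();
            return mode;
    }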
Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251104170448.630414-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/9p/acl.c | 1 + fs/btrfs/inode.c | 1 + fs/f2fs/acl.c | 1 + fs/fat/inode.c | 1 + fs/fs_struct.c | 6 ------ fs/hfsplus/options.c | 1 + fs/hpfs/super.c | 1 + fs/nilfs2/nilfs.h | 1 + fs/ntfs3/super.c | 1 + fs/ocfs2/acl.c | 1 + fs/omfs/inode.c | 1 + fs/smb/client/file.c | 1 + fs/smb/client/inode.c | 1 + fs/smb/client/smb1ops.c | 1 + include/linux/fs.h | 2 -- include/linux/fs_struct.h | 6 ++++++ include/linux/namei.h | 1 + 17 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eed551d8555f..633da5e37299 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..dee166426511 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d4d7f329d23f..fa8d81a30fb9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -9,6 +9,7 @@ * * Copyright (C) 2001-2003 Andreas Gruenbacher, */ +#include #include #include "f2fs.h" #include "xattr.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..309e560038dd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 28be762ac1c6..b8c46c5a38a0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -146,12 +146,6 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); -int current_umask(void) -{ - return current->fs->umask; -} -EXPORT_SYMBOL(current_umask); - /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index a66a09a56bf7..9b377481f397 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 8ab85e7ac91e..371aa6de8075 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include +#include #include #include #include diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f466daa39440..b7e3d91b6243 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "the_nilfs.h" diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index ddff94c091b8..8d09dfec970a 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 62464d194da3..af1e2cedb217 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 135c49c5d848..89dc093f2752 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "omfs.h" diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 474dadeb1593..9dc0a968ec89 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -9,6 +9,7 @@ * */ #include +#include #include #include #include diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cac355364e43..28a73717851c 100644 --- 
a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -6,6 +6,7 @@ * */ #include +#include #include #include #include diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index ca8f3dd7ff63..78650527d4bb 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -7,6 +7,7 @@ #include #include +#include #include #include "cifsglob.h" #include "cifsproto.h" diff --git a/include/linux/fs.h b/include/linux/fs.h index 64af28318fbf..c0c0095b2b60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2336,8 +2336,6 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch va_end(args); } -extern int current_umask(void); - extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index baf200ab5c77..0070764b790a 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H +#include #include #include #include @@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) extern bool current_chrooted(void); +static inline int current_umask(void) +{ + return current->fs->umask; +} + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..b0679c7420a8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -7,6 +7,7 @@ #include #include #include +#include enum { MAX_NESTED_LINKS = 8 }; -- cgit v1.2.3 From 8e4d576ed3ff917eda65b989ba56b02d9a3894f9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 13:12:30 +0100 Subject: fs: add super_write_guard Link: https://patch.msgid.link/20251104-work-guards-v1-1-5108ac78a171@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs/super.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h index c0d22b12c1c9..b874105743b3 100644 --- a/include/linux/fs/super.h +++ b/include/linux/fs/super.h @@ -125,6 +125,11 @@ static inline void sb_start_write(struct super_block *sb) __sb_start_write(sb, SB_FREEZE_WRITE); } +DEFINE_GUARD(super_write, + struct super_block *, + sb_start_write(_T), + sb_end_write(_T)) + static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -- cgit v1.2.3 From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. 
Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/crypto/hisilicon/qm.c | 27 +++++++++++++++++++++++++++ include/linux/hisi_acc_qm.h | 3 +++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index a5b96adf2d1e..e88085c2cbb3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3019,11 +3019,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5725,6 +5750,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5863,6 +5889,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c4690e365ade..ca1ec437a3ca 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ -- cgit v1.2.3 From 313a335057f0894e6e59290d4e7fb8b35ec250e6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 15:57:33 +0100 Subject: coredump: mark struct mm_struct as const We don't actually modify it. 
Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-7-b447b82f2c9b@kernel.org Signed-off-by: Christian Brauner --- fs/coredump.c | 2 +- include/linux/sched/coredump.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/coredump.c b/fs/coredump.c index 590360ba0a28..8253b28bc728 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1092,7 +1092,7 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) size_t *argv __free(kfree) = NULL; struct core_state core_state; struct core_name cn; - struct mm_struct *mm = current->mm; + const struct mm_struct *mm = current->mm; const struct linux_binfmt *binfmt = mm->binfmt; const struct cred *old_cred; int argc = 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index b7fafe999073..624fda17a785 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,7 +8,7 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { /* * By convention, dumpable bits are contained in first 32 bits of the -- cgit v1.2.3 From 34dc27f02cb3799d56a99002261e4d091da0cea4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:02 -0800 Subject: srcu: Create an srcu_expedite_current() function This commit creates an srcu_expedite_current() function that expedites the current (and possibly the next) SRCU grace period for the specified srcu_struct structure. This functionality will be inherited by RCU Tasks Trace courtesy of its mapping to SRCU fast. If the current SRCU grace period is already waiting, that wait will complete before the expediting takes effect. If there is no SRCU grace period in flight, this function might well create one. [ paulmck: Apply Zqiang feedback for PREEMPT_RT use. ] Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcutiny.h | 1 + include/linux/srcutree.h | 8 +++++++ kernel/rcu/srcutree.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 51ce25f07930..3bfbd44cb1b3 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -103,6 +103,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +static inline void srcu_expedite_current(struct srcu_struct *ssp) { } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) #define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42098e0fa0b7..93ad18acd6d0 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -42,6 +42,8 @@ struct srcu_data { struct timer_list delay_work; /* Delay for CB invoking */ struct work_struct work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct rcu_head srcu_ec_head; /* For srcu_expedite_current() use. */ + int srcu_ec_state; /* State for srcu_expedite_current(). */ struct srcu_node *mynode; /* Leaf srcu_node. */ unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. 
*/ @@ -135,6 +137,11 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* Values for srcu_expedite_current() state (->srcu_ec_state). */ +#define SRCU_EC_IDLE 0 +#define SRCU_EC_PENDING 1 +#define SRCU_EC_REPOST 2 + /* * Values for initializing gp sequence fields. Higher values allow wrap arounds to * occur earlier. @@ -210,6 +217,7 @@ struct srcu_struct { int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); +void srcu_expedite_current(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); // Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1ff94b76d91f..38b440b0b0c8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1688,6 +1688,64 @@ void srcu_barrier(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. */ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. + if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. + */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. -- cgit v1.2.3 From ee90848499b169070dbf85a4276a45ccbb7ff7d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:04 -0800 Subject: srcu: Create a DEFINE_SRCU_FAST() This commit creates DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() macros that are similar to DEFINE_SRCU() and DEFINE_STATIC_SRCU(), but which create srcu_struct structures that are usable only by readers initiated by srcu_read_lock_fast() and friends. 
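As a minimal usage sketch (illustrative, not part of this series), a statically defined SRCU-fast domain and a reader/updater pair might look like this; "example_srcu" and the helper names are made up:

    /* Illustrative sketch only. The domain is defined with the new macro,
     * so only the _fast reader flavor may be used on it.
     */
    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU_FAST(example_srcu);

    static int example_read(int *p)
    {
            struct srcu_ctr __percpu *scp;
            int val;

            scp = srcu_read_lock_fast(&example_srcu);  /* light-weight, smp_mb()-free */
            val = READ_ONCE(*p);
            srcu_read_unlock_fast(&example_srcu, scp);
            return val;
    }

    static void example_wait_for_readers(void)
    {
            /* Waits for all pre-existing SRCU-fast readers of example_srcu. */
            synchronize_srcu(&example_srcu);
    }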
This commit does make DEFINE_SRCU_FAST() available to modules, in which case the per-CPU srcu_data structures are not created at compile time, but rather at module-load time. This means that the >srcu_reader_flavor field of the srcu_data structure is not available. Therefore, this commit instead creates an ->srcu_reader_flavor field in the srcu_struct structure, adds arguments to the DEFINE_SRCU()-related macros to initialize this new field, and extends the checks in the __srcu_check_read_flavor() function to include this new field. This commit also allows dynamically allocated srcu_struct structure to be marked for SRCU-fast readers. It does so by defining a new init_srcu_struct_fast() function that marks the specified srcu_struct structure for use by srcu_read_lock_fast() and friends. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/notifier.h | 2 +- include/linux/srcu.h | 16 ++++++++++++++-- include/linux/srcutiny.h | 13 ++++++++++--- include/linux/srcutree.h | 30 +++++++++++++++++++----------- kernel/rcu/srcutree.c | 36 ++++++++++++++++++++++++++++++++++-- 5 files changed, 78 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b42e64734968..01b6c9d9956f 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -109,7 +109,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ - .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu, 0), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ada65b58bc4c..26de47820c58 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -25,8 +25,10 @@ struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key); +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +#ifndef CONFIG_TINY_SRCU +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +#endif // #ifndef CONFIG_TINY_SRCU #define init_srcu_struct(ssp) \ ({ \ @@ -35,10 +37,20 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name, __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) +#define init_srcu_struct_fast(ssp) \ +({ \ + static struct lock_class_key __srcu_key; \ + \ + __init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \ +}) + #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); +#ifndef CONFIG_TINY_SRCU +int init_srcu_struct_fast(struct srcu_struct *ssp); +#endif // #ifndef CONFIG_TINY_SRCU #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 3bfbd44cb1b3..92e6ab53398f 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -31,7 +31,7 @@ struct srcu_struct { void srcu_drive_gp(struct work_struct *wp); -#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored) \ +#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ @@ -44,13 
+44,20 @@ void srcu_drive_gp(struct work_struct *wp); * Tree SRCU, which needs some per-CPU data. */ #define DEFINE_SRCU(name) \ - struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) #define DEFINE_STATIC_SRCU(name) \ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) +#define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name) +#define DEFINE_STATIC_SRCU_FAST(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) // Dummy structure for srcu_notifier_head. struct srcu_usage { }; #define __SRCU_USAGE_INIT(name) { } +#define __init_srcu_struct_fast __init_srcu_struct +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define init_srcu_struct_fast init_srcu_struct +#endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC void synchronize_srcu(struct srcu_struct *ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 93ad18acd6d0..7ff4a11bc5a3 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -104,6 +104,7 @@ struct srcu_usage { struct srcu_struct { struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + u8 srcu_reader_flavor; struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ }; @@ -162,20 +163,21 @@ struct srcu_struct { .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } -#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ .srcu_sup = &usage_name, \ + .srcu_reader_flavor = fast, \ __SRCU_DEP_MAP_INIT(name) -#define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ +#define __SRCU_STRUCT_INIT_MODULE(name, usage_name, fast) \ { \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } -#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ +#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name, fast) \ { \ .sda = &pcpu_name, \ .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } /* @@ -196,23 +198,29 @@ struct srcu_struct { * init_srcu_struct(&my_srcu); * * See include/linux/percpu-defs.h for the rules on per-CPU variables. + * + * DEFINE_SRCU_FAST() creates an srcu_struct and associated structures + * whose readers must be of the SRCU-fast variety. 
*/ #ifdef MODULE -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage, \ + fast); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) + __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data, fast) #endif -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, 0, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, 0, static) +#define DEFINE_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */) +#define DEFINE_STATIC_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static) int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 38b440b0b0c8..9869a13b8763 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -286,16 +286,29 @@ err_free_sup: #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -308,10 +321,26 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *ssp) { + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. 
+ */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* @@ -734,6 +763,9 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && + old_read_flavor != ssp->srcu_reader_flavor); if (!old_read_flavor) { old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); if (!old_read_flavor) -- cgit v1.2.3 From 8235bcfd39e865763e764b4c968012bdfb808af1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:07 -0800 Subject: srcu: Require special srcu_struct define/init for SRCU-fast readers This commit adds CONFIG_PROVE_RCU=y checking to enforce the new rule that srcu_struct structures passed to srcu_read_lock_fast() and other SRCU-fast read-side markers be either initialized with init_srcu_struct_fast() on the one hand or defined using either DEFINE_SRCU_FAST() or DEFINE_STATIC_SRCU_FAST(). This will enable removal of the non-debug read-side checks from srcu_read_lock_fast() and friends, which on my laptop provides a 25% speedup (which admittedly amounts to about half a nanosecond, but when tracing fastpaths...) Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 34 ++++++++++++++++++++++------------ kernel/rcu/srcutree.c | 1 + 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 26de47820c58..2982b5a6930f 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -271,17 +271,26 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight - * smp_mb()-free reader. See srcu_read_lock() for more information. - * - * If srcu_read_lock_fast() is ever used on an srcu_struct structure, - * then none of the other flavors may be used, whether before, during, - * or after. Note that grace-period auto-expediting is disabled for _fast - * srcu_struct structures because auto-expedited grace periods invoke - * synchronize_rcu_expedited(), IPIs and all. - * - * Note that srcu_read_lock_fast() can be invoked only from those contexts - * where RCU is watching, that is, from contexts where it would be legal - * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + * smp_mb()-free reader. See srcu_read_lock() for more information. This + * function is NMI-safe, in a manner similar to srcu_read_lock_nmisafe(). + * + * For srcu_read_lock_fast() to be used on an srcu_struct structure, + * that structure must have been defined using either DEFINE_SRCU_FAST() + * or DEFINE_STATIC_SRCU_FAST() on the one hand or initialized with + * init_srcu_struct_fast() on the other. Such an srcu_struct structure + * cannot be passed to any non-fast variant of srcu_read_{,un}lock() or + * srcu_{down,up}_read(). In kernels built with CONFIG_PROVE_RCU=y, + * __srcu_check_read_flavor() will complain bitterly if you ignore this + * restriction. 
+ * + * Grace-period auto-expediting is disabled for SRCU-fast srcu_struct + * structures because SRCU-fast expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. If you need expedited + * SRCU-fast grace periods, use synchronize_srcu_expedited(). + * + * The srcu_read_lock_fast() function can be invoked only from those + * contexts where RCU is watching, that is, from contexts where it would + * be legal to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) { @@ -317,7 +326,8 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ * srcu_down_read() for more information. * * The same srcu_struct may be used concurrently by srcu_down_read_fast() - * and srcu_read_lock_fast(). + * and srcu_read_lock_fast(). However, the same definition/initialization + * requirements called out for srcu_read_lock_safe() apply. */ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c29203b23d1a..2f8aa280911e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,6 +766,7 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && old_read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor); if (!old_read_flavor) { old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); if (!old_read_flavor) -- cgit v1.2.3 From ac51c40c2c148a75f3191ff401c9889a7fc12cb1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:08 -0800 Subject: srcu: Make SRCU-fast readers enforce use of SRCU-fast definition/init This commit makes CONFIG_PROVE_RCU=y kernels enforce the new rule that srcu_struct structures that are passed to srcu_read_lock_fast() and other SRCU-fast read-side markers be either initialized with init_srcu_struct_fast() on the one hand or defined with DEFINE_SRCU_FAST() or DEFINE_STATIC_SRCU_FAST() on the other. This eliminates the read-side test that was formerly included in srcu_read_lock_fast() and friends, speeding these primitives up by about 25% (admittedly only about half of a nanosecond, but when tracing on fastpaths...) Signed-off-by: Paul E. 
McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 6 +++--- include/linux/srcutiny.h | 1 - include/linux/srcutree.h | 16 +--------------- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 2982b5a6930f..41e27c1d917d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -297,7 +297,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * struct srcu_ctr __percpu *retval; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -312,7 +312,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ { struct srcu_ctr __percpu *retval; - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); return retval; } @@ -333,7 +333,7 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct * { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); return __srcu_read_lock_fast(ssp); } diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 92e6ab53398f..1ecc3393fb26 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -112,7 +112,6 @@ static inline void srcu_barrier(struct srcu_struct *ssp) static inline void srcu_expedite_current(struct srcu_struct *ssp) { } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 7ff4a11bc5a3..6080a9094618 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -307,21 +307,7 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is -// needed only for flavors that require grace-period smp_mb() calls to be -// promoted to synchronize_rcu(). -static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) -{ - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) - return; - - // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, read_flavor); -} - -// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +// Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels. 
static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { if (IS_ENABLED(CONFIG_PROVE_RCU)) -- cgit v1.2.3 From 34e82569d59391bf7d808a558ff631c4428b026d Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 5 Nov 2025 12:19:57 -0800 Subject: rcu: use WRITE_ONCE() for ->next and ->pprev of hlist_nulls In rculist_nulls.h we can still see ordinary assignments to ->pprev and ->next of hlist_nulls. As noted in the two patches below: commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for rculist_nulls") commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls") We should use WRITE_ONCE(). Signed-off-by: Xuanqiang Luo Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rculist_nulls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 89186c499dd4..d5a656cc4c6a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -138,7 +138,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, if (last) { WRITE_ONCE(n->next, last->next); - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); @@ -148,8 +148,8 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { - n->pprev = &n->next; - n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); + WRITE_ONCE(n->pprev, &n->next); + WRITE_ONCE(n->next, (struct hlist_nulls_node *)NULLS_MARKER(NULL)); } /** -- cgit v1.2.3 From 20a0bc10272fa17a44fc857c31574a8306f60d20 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Nov 2025 22:54:03 +0100 Subject: x86/fgraph,bpf: Fix stack ORC unwind from kprobe_multi return probe Currently we don't get stack trace via ORC unwinder on top of fgraph exit handler. We can see that when generating stacktrace from kretprobe_multi bpf program which is based on fprobe/fgraph. The reason is that the ORC unwind code won't get pass the return_to_handler callback installed by fgraph return probe machinery. Solving this by creating stack frame in return_to_handler expected by ftrace_graph_ret_addr function to recover original return address and continue with the unwind. Also updating the pt_regs data with cs/flags/rsp which are needed for successful stack retrieval from ebpf bpf_get_stackid helper. 
- in get_perf_callchain we check user_mode(regs) so CS has to be set - in perf_callchain_kernel we call perf_hw_regs(regs), so EFLAGS/FIXED has to be unset Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20251104215405.168643-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 5 +++++ arch/x86/kernel/ftrace_64.S | 8 +++++++- include/linux/ftrace.h | 10 +++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 93156ac4ffe0..b08c95872eed 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &arch_ftrace_regs(fregs)->regs; } +#define arch_ftrace_partial_regs(regs) do { \ + regs->flags &= ~X86_EFLAGS_FIXED; \ + regs->cs = __KERNEL_CS; \ +} while (0) + #define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \ (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 367da3638167..823dbdd0eb41 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR + /* Restore return_to_handler value that got eaten by previous ret instruction. */ + subq $8, %rsp + UNWIND_HINT_FUNC + /* Save ftrace_regs for function exit context */ subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rdx, RDX(%rsp) movq %rbp, RBP(%rsp) + movq %rsp, RSP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler @@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler) movq RDX(%rsp), %rdx movq RAX(%rsp), %rax - addq $(FRAME_SIZE), %rsp + addq $(FRAME_SIZE) + 8, %rsp + /* * Jump back to the old return address. This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7ded7df6e9b5..07f8c309e432 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -193,6 +193,10 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs #if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \ defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS) +#ifndef arch_ftrace_partial_regs +#define arch_ftrace_partial_regs(regs) do {} while (0) +#endif + static __always_inline struct pt_regs * ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) { @@ -202,7 +206,11 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) * Since arch_ftrace_get_regs() will check some members and may return * NULL, we can not use it. */ - return &arch_ftrace_regs(fregs)->regs; + regs = &arch_ftrace_regs(fregs)->regs; + + /* Allow arch specific updates to regs. */ + arch_ftrace_partial_regs(regs); + return regs; } #endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ -- cgit v1.2.3 From b4ce5923e780d6896d4aaf19de5a27652b8bf1ea Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 5 Nov 2025 09:03:59 +0000 Subject: bpf, x86: add new map type: instructions array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are translated by the verifier into "xlated" BPF programs. 
During this process the original instruction offsets might be adjusted and/or individual instructions might be replaced by new sets of instructions, or deleted.

Add a new BPF map type intended to keep track of how, for a given program, the original instructions were relocated during verification. Also, besides keeping track of the original -> xlated mapping, make the x86 JIT build the xlated -> jitted mapping for every instruction listed in an instruction array. This is required for every future application of instruction arrays: static keys, indirect jumps and indirect calls.

A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with u32 keys and with values of type struct bpf_insn_array_value, which this patch adds to the UAPI. The values have different semantics for userspace and for the BPF side. For userspace a value carries the xlated and jitted offsets as u32 fields; for the BPF side the value is a real pointer to a jitted instruction.

On map creation/initialization, before loading the program, each element of the map should be initialized to point to an instruction offset within the program. Before the program is loaded, such maps must be frozen. After program verification the xlated and jitted offsets can be read via the bpf(2) syscall. If a tracked instruction is removed by the verifier, the xlated offset is set to (u32)-1, which is considered too big for a valid BPF program offset.

Such a map can, obviously, be used to track one and only one BPF program. If the verification process was unsuccessful, the same map can be re-used to verify the program with a different log level. However, if the program was loaded fine, then such a map, being frozen in any case, can't be reused by other programs even after the program is released.

Example. Consider the following original and xlated programs:

     Original prog:                 Xlated prog:

      0: r1 = 0x0                    0: r1 = 0
      1: *(u32 *)(r10 - 0x4) = r1    1: *(u32 *)(r10 -4) = r1
      2: r2 = r10                    2: r2 = r10
      3: r2 += -0x4                  3: r2 += -4
      4: r1 = 0x0 ll                 4: r1 = map[id:88]
      6: call 0x1                    6: r1 += 272
                                     7: r0 = *(u32 *)(r2 +0)
                                     8: if r0 >= 0x1 goto pc+3
                                     9: r0 <<= 3
                                    10: r0 += r1
                                    11: goto pc+1
                                    12: r0 = 0
      7: r6 = r0                    13: r6 = r0
      8: if r6 == 0x0 goto +0x2     14: if r6 == 0x0 goto pc+4
      9: call 0x76                  15: r0 = 0xffffffff8d2079c0
                                    17: r0 = *(u64 *)(r0 +0)
     10: *(u64 *)(r6 + 0x0) = r0    18: *(u64 *)(r6 +0) = r0
     11: r0 = 0x0                   19: r0 = 0x0
     12: exit                       20: exit

An instruction array map containing, e.g., instructions [0,4,7,12] will be translated by the verifier to [0,4,13,20]. A map with index 5 (the middle of a 16-byte instruction) or indexes greater than 12 (outside the program boundaries) would be rejected.

The functionality provided by this patch will be extended in subsequent patches to implement BPF Static Keys, indirect jumps, and indirect calls.
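For reference, a minimal userspace sketch of the intended flow (create and populate the map, freeze it, then read back the relocated offsets after the program load); the helper name and error handling are illustrative, and it assumes a libbpf new enough to pass the new map type through unchanged:

    /* Illustrative sketch, not part of this patch. Assumes the new UAPI
     * definitions (BPF_MAP_TYPE_INSN_ARRAY, struct bpf_insn_array_value).
     * The BPF_PROG_LOAD step, which must reference the frozen map, is elided.
     */
    #include <bpf/bpf.h>
    #include <linux/bpf.h>

    static int example_track_insns(const __u32 *offs, __u32 n)
    {
            struct bpf_insn_array_value v = {};
            __u32 i;
            int fd;

            fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "insn_array",
                                sizeof(__u32), sizeof(v), n, NULL);
            if (fd < 0)
                    return fd;

            for (i = 0; i < n; i++) {
                    v.orig_off = offs[i];   /* offset into the original program */
                    bpf_map_update_elem(fd, &i, &v, 0);
            }
            bpf_map_freeze(fd);             /* required before BPF_PROG_LOAD */

            /* ... load the BPF program that uses this map ... */

            for (i = 0; i < n; i++) {
                    bpf_map_lookup_elem(fd, &i, &v);
                    /* v.xlated_off / v.jitted_off now hold the relocated
                     * offsets, or xlated_off == (__u32)-1 if the instruction
                     * was removed by the verifier.
                     */
            }
            return fd;
    }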
Signed-off-by: Anton Protopopov Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 9 ++ include/linux/bpf.h | 15 +++ include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 2 + include/uapi/linux/bpf.h | 21 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_insn_array.c | 286 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 22 ++++ kernel/bpf/verifier.c | 45 +++++++ tools/include/uapi/linux/bpf.h | 21 +++ 10 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_insn_array.c (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index de5083cb1d37..91f92d65ae83 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3827,6 +3827,15 @@ out_image: jit_data->header = header; jit_data->rw_header = rw_header; } + + /* + * The bpf_prog_update_insn_ptrs function expects addrs to + * point to the first byte of the jitted instruction (unlike + * the bpf_prog_fill_jited_linfo below, which, for historical + * reasons, expects to point to the next instruction) + */ + bpf_prog_update_insn_ptrs(prog, addrs, image); + /* * ctx.prog_offset is used when CFI preambles put code *before* * the function. See emit_cfi(). For FineIBT specifically this code diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a47d67db3be5..9d41a6affcef 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3797,4 +3797,19 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); +int bpf_insn_array_ready(struct bpf_map *map); +void bpf_insn_array_release(struct bpf_map *map); +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); +#else +static inline void +bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ +} +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa78f49d4a9a..b13de31e163f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c6eb68b6389c..6b820d8d77af 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -754,8 +754,10 @@ struct bpf_verifier_env { struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ + struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ 
+ u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d73f165394d..f5713f59ac10 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7fd0badfacb1..232cbc97434d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 000000000000..2053fda377bb --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ + +#include + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array 
*insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. 
+ */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. + */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..f62d61b6730a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1493,6 +1493,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -2853,6 +2854,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr return err; } +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -3082,6 +3100,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto 
free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4928846e763..dfe5741812b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10086,6 +10086,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -20582,6 +20584,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20828,6 +20839,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20869,6 +20907,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -21052,6 +21091,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -21695,6 +21736,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { @@ -24871,6 +24914,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d73f165394d..f5713f59ac10 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From b1d16f7c0063b7209fd3251ce40c77d37b477b83 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 4 Nov 2025 09:23:31 -0800 Subject: libie: depend on DEBUG_FS when building LIBIE_FWLOG LIBIE_FWLOG is unusable without DEBUG_FS. Mark it in Kconfig. Fix build error on ixgbe when DEBUG_FS is not set. To not add another layer of #if IS_ENABLED(LIBIE_FWLOG) in ixgbe fwlog code define debugfs dentry even when DEBUG_FS isn't enabled. In this case the dummy functions of LIBIE_FWLOG will be used, so not initialized dentry isn't a problem. Fixes: 641585bc978e ("ixgbe: fwlog support for e610") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/lkml/f594c621-f9e1-49f2-af31-23fbcb176058@roeck-us.net/ Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20251104172333.752445-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/Kconfig | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 -- include/linux/net/intel/libie/fwlog.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index a563a94e2780..122ee23497e6 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -146,7 +146,7 @@ config IXGBE tristate "Intel(R) 10GbE PCI Express adapters support" depends on PCI depends on PTP_1588_CLOCK_OPTIONAL - select LIBIE_FWLOG + select LIBIE_FWLOG if DEBUG_FS select MDIO select NET_DEVLINK select PLDMFW @@ -298,7 +298,7 @@ config ICE select DIMLIB select LIBIE select LIBIE_ADMINQ - select LIBIE_FWLOG + select LIBIE_FWLOG if DEBUG_FS select NET_DEVLINK select PACKING select PLDMFW diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 14d275270123..dce4936708eb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -821,9 +821,7 @@ struct ixgbe_adapter { #ifdef CONFIG_IXGBE_HWMON struct hwmon_buff *ixgbe_hwmon_buff; #endif /* CONFIG_IXGBE_HWMON */ -#ifdef CONFIG_DEBUG_FS struct dentry *ixgbe_dbg_adapter; -#endif /*CONFIG_DEBUG_FS*/ u8 default_up; /* Bitmask indicating in use pools */ diff --git 
a/include/linux/net/intel/libie/fwlog.h b/include/linux/net/intel/libie/fwlog.h index 36b13fabca9e..7273c78c826b 100644 --- a/include/linux/net/intel/libie/fwlog.h +++ b/include/linux/net/intel/libie/fwlog.h @@ -78,8 +78,20 @@ struct libie_fwlog { ); }; +#if IS_ENABLED(CONFIG_LIBIE_FWLOG) int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); void libie_fwlog_deinit(struct libie_fwlog *fwlog); void libie_fwlog_reregister(struct libie_fwlog *fwlog); void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#else +static inline int libie_fwlog_init(struct libie_fwlog *fwlog, + struct libie_fwlog_api *api) +{ + return -EOPNOTSUPP; +} +static inline void libie_fwlog_deinit(struct libie_fwlog *fwlog) { } +static inline void libie_fwlog_reregister(struct libie_fwlog *fwlog) { } +static inline void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, + u16 len) { } +#endif /* CONFIG_LIBIE_FWLOG */ #endif /* _LIBIE_FWLOG_H_ */ -- cgit v1.2.3 From 493d9e0d608339a32f568504d5fd411a261bb0af Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 5 Nov 2025 09:04:06 +0000 Subject: bpf, x86: add support for indirect jumps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for a new instruction BPF_JMP|BPF_X|BPF_JA, SRC=0, DST=Rx, off=0, imm=0 which does an indirect jump to a location stored in Rx. The register Rx should have type PTR_TO_INSN. This new type assures that the Rx register contains a value (or a range of values) loaded from a correct jump table – map of type instruction array. For example, for a C switch LLVM will generate the following code: 0: r3 = r1 # "switch (r3)" 1: if r3 > 0x13 goto +0x666 # check r3 boundaries 2: r3 <<= 0x3 # adjust to an index in array of addresses 3: r1 = 0xbeef ll # r1 is PTR_TO_MAP_VALUE, r1->map_ptr=M 5: r1 += r3 # r1 inherits boundaries from r3 6: r1 = *(u64 *)(r1 + 0x0) # r1 now has type INSN_TO_PTR 7: gotox r1 # jit will generate proper code Here the gotox instruction corresponds to one particular map. This is possible however to have a gotox instruction which can be loaded from different maps, e.g. 0: r1 &= 0x1 1: r2 <<= 0x3 2: r3 = 0x0 ll # load from map M_1 4: r3 += r2 5: if r1 == 0x0 goto +0x4 6: r1 <<= 0x3 7: r3 = 0x0 ll # load from map M_2 9: r3 += r1 A: r1 = *(u64 *)(r3 + 0x0) B: gotox r1 # jump to target loaded from M_1 or M_2 During check_cfg stage the verifier will collect all the maps which point to inside the subprog being verified. When building the config, the high 16 bytes of the insn_state are used, so this patch (theoretically) supports jump tables of up to 2^16 slots. During the later stage, in check_indirect_jump, it is checked that the register Rx was loaded from a particular instruction array. 
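As a rough C-level illustration of the above (not part of this series; the program, section name and return codes are made up, and an LLVM able to emit BPF jump tables is assumed), a dense switch is the typical source of such a gotox:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	SEC("tc")
	int classify(struct __sk_buff *skb)
	{
		/*
		 * A dense switch on a bounded selector may be lowered to a
		 * hidden BPF_MAP_TYPE_INSN_ARRAY jump table plus a single
		 * gotox, i.e. the bounds-check/shift/load/gotox sequence
		 * shown above.
		 */
		switch (skb->mark & 0xf) {
		case 0: return TC_ACT_UNSPEC;
		case 1: return TC_ACT_OK;
		case 2: return TC_ACT_SHOT;
		case 3: return TC_ACT_PIPE;
		/* ... more dense cases ... */
		default: return TC_ACT_OK;
		}
	}
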
Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251105090410.1250500-9-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 3 + include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 9 ++ kernel/bpf/bpf_insn_array.c | 15 ++ kernel/bpf/core.c | 1 + kernel/bpf/liveness.c | 3 + kernel/bpf/log.c | 1 + kernel/bpf/verifier.c | 373 ++++++++++++++++++++++++++++++++++++++++++- 8 files changed, 400 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bbd2b03d2b74..36a0d4db9f68 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2628,6 +2628,9 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; + case BPF_JMP | BPF_JA | BPF_X: + emit_indirect_jump(&prog, insn->dst_reg, image + addrs[i - 1]); + break; case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: if (BPF_CLASS(insn->code) == BPF_JMP) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d41a6affcef..09d5dc541d1c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1001,6 +1001,7 @@ enum bpf_reg_type { PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6b820d8d77af..5441341f1ab9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -527,6 +527,7 @@ struct bpf_insn_aux_data { struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ + struct bpf_iarray *jt; /* jump table for gotox instruction */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ @@ -840,6 +841,7 @@ struct bpf_verifier_env { struct bpf_scc_info **scc_info; u32 scc_cnt; struct bpf_iarray *succ; + struct bpf_iarray *gotox_tmp_buf; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1050,6 +1052,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_ return !(off % BPF_REG_SIZE); } +static inline bool insn_is_gotox(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_JA && + BPF_SRC(insn->code) == BPF_X; +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index 2053fda377bb..61ce52882632 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -114,6 +114,20 @@ static u64 insn_array_mem_usage(const struct bpf_map *map) return insn_array_alloc_size(map->max_entries); } +static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + if ((off % sizeof(long)) != 0 || + (off / sizeof(long)) >= map->max_entries) + return -EINVAL; + + /* from BPF's point of view, this map is a jump table */ + *imm = (unsigned long)insn_array->ips + off; + + return 0; +} + BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) const struct bpf_map_ops insn_array_map_ops = { @@ -126,6 +140,7 @@ const struct bpf_map_ops insn_array_map_ops = { 
.map_delete_elem = insn_array_delete_elem, .map_check_btf = insn_array_check_btf, .map_mem_usage = insn_array_mem_usage, + .map_direct_value_addr = insn_array_map_direct_value_addr, .map_btf_id = &insn_array_btf_ids[0], }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4b62a03d6df5..ef4448f18aad 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1708,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index bffb495bc933..a7240013fd9d 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -485,6 +485,9 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) struct bpf_iarray *succ; int insn_sz; + if (unlikely(insn_is_gotox(insn))) + return env->insn_aux_data[idx].jt; + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ succ = env->succ; succ->cnt = 0; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 70221aafc35c..a0c3b35de2ce 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", + [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 781669f649f2..1268fa075d4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6006,6 +6006,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. 
+ */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -6015,11 +6027,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -7481,6 +7493,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + bool insn_array = reg->type == PTR_TO_MAP_VALUE && + reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -7488,7 +7502,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return size; /* alignment checks will add in reg->off themselves */ - err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); + err = check_ptr_alignment(env, reg, off, size, strict_alignment_once || insn_array); if (err) return err; @@ -7515,6 +7529,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } + if (t == BPF_WRITE && insn_array) { + verbose(env, "writes into insn_array not allowed\n"); + return -EACCES; + } + err = check_map_access_type(env, regno, off, size, t); if (err) return err; @@ -7543,6 +7562,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(®s[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(®s[value_regno], reg); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } @@ -17096,7 +17123,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - WARN_ON_ONCE(map->max_entries != 1); + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -17864,6 +17892,206 @@ static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) return new; } +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = map->ops->map_lookup_elem(map, &i); + if (!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), 
cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? 
jt->cnt : 0; + jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + static struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? 
KEEP_EXPLORING : DONE_EXPLORING; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -17956,8 +18184,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; @@ -18886,6 +19114,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + rold->off == rcur->off && range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -19895,6 +20127,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; } +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index, max_index; + const u32 size = 8; + + if (check_add_overflow(reg->umin_value, reg->off, &min_index) || + (min_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", + regno, reg->umin_value, reg->off); + return -ERANGE; + } + if (check_add_overflow(reg->umax_value, reg->off, &max_index) || + (max_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", + regno, reg->umax_value, reg->off); + return -ERANGE; + } + + min_index /= size; + max_index /= size; + + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; + } + + *pmin_index = min_index; + *pmax_index = max_index; + return 0; +} + +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; + + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } + + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; + + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; + + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; + + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } + + n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } + + 
for (i = 0; i < n - 1; i++) { + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); + } + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + return 0; +} + static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { int err; @@ -19997,6 +20322,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || + insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + return check_indirect_jump(env, insn); + } + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || @@ -20513,6 +20847,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: break; default: verbose(env, @@ -21070,6 +21405,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, return 0; } +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] + */ +static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (insn_is_gotox(&insns[i])) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; @@ -21079,6 +21435,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + clear_insn_aux_data(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -24945,12 +25304,14 @@ err_release_maps: err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } -- cgit v1.2.3 From b340412a3b22b60b5e19cce8726940c7b5b14439 Mon Sep 17 00:00:00 2001 From: James Calligeros Date: Sat, 25 Oct 2025 10:24:36 +1000 Subject: mfd: macsmc: Add new __SMC_KEY macro When using the _SMC_KEY macro in switch/case statements, GCC 15.2.1 errors out with 'case label does not reduce to an integer constant'. Introduce a new __SMC_KEY macro that can be used instead. 
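A minimal usage sketch (the key name and handler below are hypothetical; <linux/mfd/macsmc.h> is assumed to be included):

	static int example_handle_key(u32 key)
	{
		switch (key) {
		/*
		 * _SMC_KEY("F0Tg") expands to "F0Tg"[0] << 24 | ..., and
		 * indexing a string literal is not an integer constant
		 * expression, hence the GCC 15 error above when it is used
		 * as a case label.  __SMC_KEY builds the same u32 value
		 * from plain character constants instead:
		 */
		case __SMC_KEY('F', '0', 'T', 'g'):	/* hypothetical key */
			return 1;
		default:
			return 0;
		}
	}
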
Signed-off-by: James Calligeros Link: https://patch.msgid.link/20251025-macsmc-subdevs-v4-5-374d5c9eba0e@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/macsmc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h index 6b13f01a8592..f6f80c33b5cf 100644 --- a/include/linux/mfd/macsmc.h +++ b/include/linux/mfd/macsmc.h @@ -41,6 +41,7 @@ typedef u32 smc_key; */ #define SMC_KEY(s) (smc_key)(_SMC_KEY(#s)) #define _SMC_KEY(s) (((s)[0] << 24) | ((s)[1] << 16) | ((s)[2] << 8) | (s)[3]) +#define __SMC_KEY(a, b, c, d) (((u32)(a) << 24) | ((u32)(b) << 16) | ((u32)(c) << 8) | ((u32)(d))) #define APPLE_SMC_READABLE BIT(7) #define APPLE_SMC_WRITABLE BIT(6) -- cgit v1.2.3 From d306cbbc34cc9aa6ed2235472110fe797f887db7 Mon Sep 17 00:00:00 2001 From: Atharva Tiwari Date: Tue, 7 Oct 2025 18:35:10 +0530 Subject: mfd: macsmc: Make SMC write buffers const Mark the write buffer arguments in apple_smc_write(), apple_smc_rw(), and apple_smc_write_atomic() as const. These functions do not modify the data provided by the caller, so the parameters should be const qualified. Signed-off-by: Atharva Tiwari Reviewed-by: Sven Peter Signed-off-by: Lee Jones --- drivers/mfd/macsmc.c | 6 +++--- include/linux/mfd/macsmc.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c index e6cdae221f1d..e3893e255ce5 100644 --- a/drivers/mfd/macsmc.c +++ b/drivers/mfd/macsmc.c @@ -173,7 +173,7 @@ int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size) } EXPORT_SYMBOL(apple_smc_read); -int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size) +int apple_smc_write(struct apple_smc *smc, smc_key key, const void *buf, size_t size) { guard(mutex)(&smc->mutex); @@ -181,7 +181,7 @@ int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size) } EXPORT_SYMBOL(apple_smc_write); -int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize, +int apple_smc_rw(struct apple_smc *smc, smc_key key, const void *wbuf, size_t wsize, void *rbuf, size_t rsize) { guard(mutex)(&smc->mutex); @@ -239,7 +239,7 @@ int apple_smc_enter_atomic(struct apple_smc *smc) } EXPORT_SYMBOL(apple_smc_enter_atomic); -int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size) +int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, const void *buf, size_t size) { guard(spinlock_irqsave)(&smc->lock); u8 result; diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h index f6f80c33b5cf..cc09ecce0df7 100644 --- a/include/linux/mfd/macsmc.h +++ b/include/linux/mfd/macsmc.h @@ -150,7 +150,7 @@ int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size); * * Return: Zero on success, negative errno on error */ -int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size); +int apple_smc_write(struct apple_smc *smc, smc_key key, const void *buf, size_t size); /** * apple_smc_enter_atomic - Enter atomic mode to be able to use apple_smc_write_atomic @@ -177,7 +177,7 @@ int apple_smc_enter_atomic(struct apple_smc *smc); * * Return: Zero on success, negative errno on error */ -int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size); +int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, const void *buf, size_t size); /** * apple_smc_rw - Write and then read using the given SMC key @@ -190,7 +190,7 @@ 
int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t * * Return: Zero on success, negative errno on error */ -int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize, +int apple_smc_rw(struct apple_smc *smc, smc_key key, const void *wbuf, size_t wsize, void *rbuf, size_t rsize); /** -- cgit v1.2.3 From 617347e716178d3a317a129ece05116967f06d53 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 25 Jun 2025 14:32:58 +0100 Subject: mfd: wl1273-core: Remove the header The wl1273 FM radio is on Arnd's unused driver list: https://lore.kernel.org/lkml/a15bb180-401d-49ad-a212-0c81d613fbc8@app.fastmail.com/ Other patches have removed the core, the ASoC code and the Radio code. With all those in, remove the header. Also, tidy the ref in the docs. Signed-off-by: Dr. David Alan Gilbert Acked-by: Arnd Bergmann Signed-off-by: Lee Jones --- Documentation/admin-guide/media/radio-cardlist.rst | 1 - include/linux/mfd/wl1273-core.h | 277 --------------------- 2 files changed, 278 deletions(-) delete mode 100644 include/linux/mfd/wl1273-core.h (limited to 'include/linux') diff --git a/Documentation/admin-guide/media/radio-cardlist.rst b/Documentation/admin-guide/media/radio-cardlist.rst index a82a146bf912..cec724256812 100644 --- a/Documentation/admin-guide/media/radio-cardlist.rst +++ b/Documentation/admin-guide/media/radio-cardlist.rst @@ -30,7 +30,6 @@ radio-terratec TerraTec ActiveRadio ISA Standalone radio-timb Enable the Timberdale radio driver radio-trust Trust FM radio card radio-typhoon Typhoon Radio (a.k.a. EcoRadio) -radio-wl1273 Texas Instruments WL1273 I2C FM Radio fm_drv ISA radio devices fm_drv ISA radio devices radio-zoltrix Zoltrix Radio diff --git a/include/linux/mfd/wl1273-core.h b/include/linux/mfd/wl1273-core.h deleted file mode 100644 index c28cf76d5c31..000000000000 --- a/include/linux/mfd/wl1273-core.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * include/linux/mfd/wl1273-core.h - * - * Some definitions for the wl1273 radio receiver/transmitter chip. - * - * Copyright (C) 2010 Nokia Corporation - * Author: Matti J. 
Aaltonen - */ - -#ifndef WL1273_CORE_H -#define WL1273_CORE_H - -#include -#include - -#define WL1273_FM_DRIVER_NAME "wl1273-fm" -#define RX71_FM_I2C_ADDR 0x22 - -#define WL1273_STEREO_GET 0 -#define WL1273_RSSI_LVL_GET 1 -#define WL1273_IF_COUNT_GET 2 -#define WL1273_FLAG_GET 3 -#define WL1273_RDS_SYNC_GET 4 -#define WL1273_RDS_DATA_GET 5 -#define WL1273_FREQ_SET 10 -#define WL1273_AF_FREQ_SET 11 -#define WL1273_MOST_MODE_SET 12 -#define WL1273_MOST_BLEND_SET 13 -#define WL1273_DEMPH_MODE_SET 14 -#define WL1273_SEARCH_LVL_SET 15 -#define WL1273_BAND_SET 16 -#define WL1273_MUTE_STATUS_SET 17 -#define WL1273_RDS_PAUSE_LVL_SET 18 -#define WL1273_RDS_PAUSE_DUR_SET 19 -#define WL1273_RDS_MEM_SET 20 -#define WL1273_RDS_BLK_B_SET 21 -#define WL1273_RDS_MSK_B_SET 22 -#define WL1273_RDS_PI_MASK_SET 23 -#define WL1273_RDS_PI_SET 24 -#define WL1273_RDS_SYSTEM_SET 25 -#define WL1273_INT_MASK_SET 26 -#define WL1273_SEARCH_DIR_SET 27 -#define WL1273_VOLUME_SET 28 -#define WL1273_AUDIO_ENABLE 29 -#define WL1273_PCM_MODE_SET 30 -#define WL1273_I2S_MODE_CONFIG_SET 31 -#define WL1273_POWER_SET 32 -#define WL1273_INTX_CONFIG_SET 33 -#define WL1273_PULL_EN_SET 34 -#define WL1273_HILO_SET 35 -#define WL1273_SWITCH2FREF 36 -#define WL1273_FREQ_DRIFT_REPORT 37 - -#define WL1273_PCE_GET 40 -#define WL1273_FIRM_VER_GET 41 -#define WL1273_ASIC_VER_GET 42 -#define WL1273_ASIC_ID_GET 43 -#define WL1273_MAN_ID_GET 44 -#define WL1273_TUNER_MODE_SET 45 -#define WL1273_STOP_SEARCH 46 -#define WL1273_RDS_CNTRL_SET 47 - -#define WL1273_WRITE_HARDWARE_REG 100 -#define WL1273_CODE_DOWNLOAD 101 -#define WL1273_RESET 102 - -#define WL1273_FM_POWER_MODE 254 -#define WL1273_FM_INTERRUPT 255 - -/* Transmitter API */ - -#define WL1273_CHANL_SET 55 -#define WL1273_SCAN_SPACING_SET 56 -#define WL1273_REF_SET 57 -#define WL1273_POWER_ENB_SET 90 -#define WL1273_POWER_ATT_SET 58 -#define WL1273_POWER_LEV_SET 59 -#define WL1273_AUDIO_DEV_SET 60 -#define WL1273_PILOT_DEV_SET 61 -#define WL1273_RDS_DEV_SET 62 -#define WL1273_PUPD_SET 91 -#define WL1273_AUDIO_IO_SET 63 -#define WL1273_PREMPH_SET 64 -#define WL1273_MONO_SET 66 -#define WL1273_MUTE 92 -#define WL1273_MPX_LMT_ENABLE 67 -#define WL1273_PI_SET 93 -#define WL1273_ECC_SET 69 -#define WL1273_PTY 70 -#define WL1273_AF 71 -#define WL1273_DISPLAY_MODE 74 -#define WL1273_RDS_REP_SET 77 -#define WL1273_RDS_CONFIG_DATA_SET 98 -#define WL1273_RDS_DATA_SET 99 -#define WL1273_RDS_DATA_ENB 94 -#define WL1273_TA_SET 78 -#define WL1273_TP_SET 79 -#define WL1273_DI_SET 80 -#define WL1273_MS_SET 81 -#define WL1273_PS_SCROLL_SPEED 82 -#define WL1273_TX_AUDIO_LEVEL_TEST 96 -#define WL1273_TX_AUDIO_LEVEL_TEST_THRESHOLD 73 -#define WL1273_TX_AUDIO_INPUT_LEVEL_RANGE_SET 54 -#define WL1273_RX_ANTENNA_SELECT 87 -#define WL1273_I2C_DEV_ADDR_SET 86 -#define WL1273_REF_ERR_CALIB_PARAM_SET 88 -#define WL1273_REF_ERR_CALIB_PERIODICITY_SET 89 -#define WL1273_SOC_INT_TRIGGER 52 -#define WL1273_SOC_AUDIO_PATH_SET 83 -#define WL1273_SOC_PCMI_OVERRIDE 84 -#define WL1273_SOC_I2S_OVERRIDE 85 -#define WL1273_RSSI_BLOCK_SCAN_FREQ_SET 95 -#define WL1273_RSSI_BLOCK_SCAN_START 97 -#define WL1273_RSSI_BLOCK_SCAN_DATA_GET 5 -#define WL1273_READ_FMANT_TUNE_VALUE 104 - -#define WL1273_RDS_OFF 0 -#define WL1273_RDS_ON 1 -#define WL1273_RDS_RESET 2 - -#define WL1273_AUDIO_DIGITAL 0 -#define WL1273_AUDIO_ANALOG 1 - -#define WL1273_MODE_RX BIT(0) -#define WL1273_MODE_TX BIT(1) -#define WL1273_MODE_OFF BIT(2) -#define WL1273_MODE_SUSPENDED BIT(3) - -#define WL1273_RADIO_CHILD BIT(0) -#define WL1273_CODEC_CHILD BIT(1) - 
-#define WL1273_RX_MONO 1 -#define WL1273_RX_STEREO 0 -#define WL1273_TX_MONO 0 -#define WL1273_TX_STEREO 1 - -#define WL1273_MAX_VOLUME 0xffff -#define WL1273_DEFAULT_VOLUME 0x78b8 - -/* I2S protocol, left channel first, data width 16 bits */ -#define WL1273_PCM_DEF_MODE 0x00 - -/* Rx */ -#define WL1273_AUDIO_ENABLE_I2S BIT(0) -#define WL1273_AUDIO_ENABLE_ANALOG BIT(1) - -/* Tx */ -#define WL1273_AUDIO_IO_SET_ANALOG 0 -#define WL1273_AUDIO_IO_SET_I2S 1 - -#define WL1273_PUPD_SET_OFF 0x00 -#define WL1273_PUPD_SET_ON 0x01 -#define WL1273_PUPD_SET_RETENTION 0x10 - -/* I2S mode */ -#define WL1273_IS2_WIDTH_32 0x0 -#define WL1273_IS2_WIDTH_40 0x1 -#define WL1273_IS2_WIDTH_22_23 0x2 -#define WL1273_IS2_WIDTH_23_22 0x3 -#define WL1273_IS2_WIDTH_48 0x4 -#define WL1273_IS2_WIDTH_50 0x5 -#define WL1273_IS2_WIDTH_60 0x6 -#define WL1273_IS2_WIDTH_64 0x7 -#define WL1273_IS2_WIDTH_80 0x8 -#define WL1273_IS2_WIDTH_96 0x9 -#define WL1273_IS2_WIDTH_128 0xa -#define WL1273_IS2_WIDTH 0xf - -#define WL1273_IS2_FORMAT_STD (0x0 << 4) -#define WL1273_IS2_FORMAT_LEFT (0x1 << 4) -#define WL1273_IS2_FORMAT_RIGHT (0x2 << 4) -#define WL1273_IS2_FORMAT_USER (0x3 << 4) - -#define WL1273_IS2_MASTER (0x0 << 6) -#define WL1273_IS2_SLAVEW (0x1 << 6) - -#define WL1273_IS2_TRI_AFTER_SENDING (0x0 << 7) -#define WL1273_IS2_TRI_ALWAYS_ACTIVE (0x1 << 7) - -#define WL1273_IS2_SDOWS_RR (0x0 << 8) -#define WL1273_IS2_SDOWS_RF (0x1 << 8) -#define WL1273_IS2_SDOWS_FR (0x2 << 8) -#define WL1273_IS2_SDOWS_FF (0x3 << 8) - -#define WL1273_IS2_TRI_OPT (0x0 << 10) -#define WL1273_IS2_TRI_ALWAYS (0x1 << 10) - -#define WL1273_IS2_RATE_48K (0x0 << 12) -#define WL1273_IS2_RATE_44_1K (0x1 << 12) -#define WL1273_IS2_RATE_32K (0x2 << 12) -#define WL1273_IS2_RATE_22_05K (0x4 << 12) -#define WL1273_IS2_RATE_16K (0x5 << 12) -#define WL1273_IS2_RATE_12K (0x8 << 12) -#define WL1273_IS2_RATE_11_025 (0x9 << 12) -#define WL1273_IS2_RATE_8K (0xa << 12) -#define WL1273_IS2_RATE (0xf << 12) - -#define WL1273_I2S_DEF_MODE (WL1273_IS2_WIDTH_32 | \ - WL1273_IS2_FORMAT_STD | \ - WL1273_IS2_MASTER | \ - WL1273_IS2_TRI_AFTER_SENDING | \ - WL1273_IS2_SDOWS_RR | \ - WL1273_IS2_TRI_OPT | \ - WL1273_IS2_RATE_48K) - -#define SCHAR_MIN (-128) -#define SCHAR_MAX 127 - -#define WL1273_FR_EVENT BIT(0) -#define WL1273_BL_EVENT BIT(1) -#define WL1273_RDS_EVENT BIT(2) -#define WL1273_BBLK_EVENT BIT(3) -#define WL1273_LSYNC_EVENT BIT(4) -#define WL1273_LEV_EVENT BIT(5) -#define WL1273_IFFR_EVENT BIT(6) -#define WL1273_PI_EVENT BIT(7) -#define WL1273_PD_EVENT BIT(8) -#define WL1273_STIC_EVENT BIT(9) -#define WL1273_MAL_EVENT BIT(10) -#define WL1273_POW_ENB_EVENT BIT(11) -#define WL1273_SCAN_OVER_EVENT BIT(12) -#define WL1273_ERROR_EVENT BIT(13) - -#define TUNER_MODE_STOP_SEARCH 0 -#define TUNER_MODE_PRESET 1 -#define TUNER_MODE_AUTO_SEEK 2 -#define TUNER_MODE_AF 3 -#define TUNER_MODE_AUTO_SEEK_PI 4 -#define TUNER_MODE_AUTO_SEEK_BULK 5 - -#define RDS_BLOCK_SIZE 3 - -struct wl1273_fm_platform_data { - int (*request_resources) (struct i2c_client *client); - void (*free_resources) (void); - void (*enable) (void); - void (*disable) (void); - - u8 forbidden_modes; - unsigned int children; -}; - -#define WL1273_FM_CORE_CELLS 2 - -#define WL1273_BAND_OTHER 0 -#define WL1273_BAND_JAPAN 1 - -#define WL1273_BAND_JAPAN_LOW 76000 -#define WL1273_BAND_JAPAN_HIGH 90000 -#define WL1273_BAND_OTHER_LOW 87500 -#define WL1273_BAND_OTHER_HIGH 108000 - -#define WL1273_BAND_TX_LOW 76000 -#define WL1273_BAND_TX_HIGH 108000 - -struct wl1273_core { - struct mfd_cell cells[WL1273_FM_CORE_CELLS]; - 
struct wl1273_fm_platform_data *pdata; - - unsigned int mode; - unsigned int i2s_mode; - unsigned int volume; - unsigned int audio_mode; - unsigned int channel_number; - struct mutex lock; /* for serializing fm radio operations */ - - struct i2c_client *client; - - int (*read)(struct wl1273_core *core, u8, u16 *); - int (*write)(struct wl1273_core *core, u8, u16); - int (*write_data)(struct wl1273_core *core, u8 *, u16); - int (*set_audio)(struct wl1273_core *core, unsigned int); - int (*set_volume)(struct wl1273_core *core, unsigned int); -}; - -#endif /* ifndef WL1273_CORE_H */ -- cgit v1.2.3 From 6a571d762cda6c25517c5533b8bd06d56028cdcb Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:05 +0530 Subject: soc: qcom: socinfo: Add support for new fields in revision 20 Add support for socinfo version 20. Version 20 adds a new field package id and its zeroth bit contain information that can be can be used to tune temperature thresholds on devices which might be able to withstand higher temperatures. Zeroth bit value 1 means that its heat dissipation is better and more relaxed thermal scheme can be put in place and 0 means a more aggressive scheme may be needed. Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-1-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/socinfo.c | 6 ++++++ include/linux/soc/qcom/socinfo.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index af0188bd3880..37567f5492fa 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -213,6 +213,7 @@ struct socinfo_params { u32 num_func_clusters; u32 boot_cluster; u32 boot_core; + u32 raw_package_type; }; struct smem_image_version { @@ -675,6 +676,11 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo, &qcom_socinfo->info.fmt); switch (qcom_socinfo->info.fmt) { + case SOCINFO_VERSION(0, 20): + qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type); + debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root, + &qcom_socinfo->info.raw_package_type); + fallthrough; case SOCINFO_VERSION(0, 19): qcom_socinfo->info.num_func_clusters = __le32_to_cpu(info->num_func_clusters); qcom_socinfo->info.boot_cluster = __le32_to_cpu(info->boot_cluster); diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 608950443eee..c4dae173cc30 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -82,6 +82,8 @@ struct socinfo { __le32 num_func_clusters; __le32 boot_cluster; __le32 boot_core; + /* Version 20 */ + __le32 raw_package_type; }; /* Internal feature codes */ -- cgit v1.2.3 From 6918667af5a7315eff3c56d871be4c5439f7f9d2 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:06 +0530 Subject: soc: qcom: socinfo: Add reserve field to support future extension Some of the new field added to socinfo structure with version 21, 22 and 23 which is only used by boot firmware and it is of no use for Linux.Add reserve field in socinfo so that the structure remain updated and prepared if we get any new field in future which could be used by Linux. While at it, also updates switch case for backward compatibility if the SoC runs with boot firmware which has these new version added. 
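A hedged sketch of how a consumer could combine the two socinfo changes above (the helper is hypothetical; only struct socinfo, SOCINFO_VERSION() and the bit-0 meaning of raw_package_type come from this series):

	static bool example_relaxed_thermal_scheme(const struct socinfo *info, u32 fmt)
	{
		/*
		 * Formats 21-23 only add firmware-reserved words, so they
		 * are treated exactly like format 20 here, matching the
		 * fallthrough cases in socinfo_debugfs_init().
		 */
		if (fmt < SOCINFO_VERSION(0, 20))
			return false;	/* raw_package_type not present */

		/*
		 * Bit 0 set means better heat dissipation, so a more
		 * relaxed thermal scheme may be applied.
		 */
		return le32_to_cpu(info->raw_package_type) & BIT(0);
	}
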
Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/socinfo.c | 3 +++ include/linux/soc/qcom/socinfo.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 37567f5492fa..003a2304d535 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -676,6 +676,9 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo, &qcom_socinfo->info.fmt); switch (qcom_socinfo->info.fmt) { + case SOCINFO_VERSION(0, 23): + case SOCINFO_VERSION(0, 22): + case SOCINFO_VERSION(0, 21): case SOCINFO_VERSION(0, 20): qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type); debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root, diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index c4dae173cc30..ba823a0013c5 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -84,6 +84,8 @@ struct socinfo { __le32 boot_core; /* Version 20 */ __le32 raw_package_type; + /* Version 21, 22, 23 */ + __le32 reserve1[4]; }; /* Internal feature codes */ -- cgit v1.2.3 From 38724a474c0fc37b6604e8b20c75d87446fc2fd1 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Thu, 30 Oct 2025 14:59:46 +0100 Subject: ice: add virtchnl definitions and static data for GTP RSS Add virtchnl protocol header and field definitions for advanced RSS configuration including GTPC, GTPU, L2TPv2, ECPRI, PPP, GRE, and IP fragment headers. - Define new virtchnl protocol header types - Add RSS field selectors for tunnel protocols - Extend static mapping arrays for protocol field matching - Add L2TPv2 session ID and length+session ID field support This provides the foundational definitions needed for VF RSS configuration of tunnel protocols. 
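To sketch how the extended static tables are meant to be consumed (the helper is hypothetical, and the member names of struct ice_vc_hash_field_match_type are assumed from the existing driver rather than spelled out in this diff):

	static u64 example_vc_to_ice_hash_fields(s32 vc_hdr, u32 vc_field_selector)
	{
		int i;

		/*
		 * Exact-match the VF's (protocol header, field selector)
		 * request against the static list; a request with no
		 * matching entry is rejected by the caller.
		 */
		for (i = 0; i < ARRAY_SIZE(ice_vc_hash_field_list); i++) {
			if (ice_vc_hash_field_list[i].vc_hdr == vc_hdr &&
			    ice_vc_hash_field_list[i].vc_hash_field == vc_field_selector)
				return ice_vc_hash_field_list[i].ice_hash_field;
		}

		return 0;
	}
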
Co-developed-by: Dan Nowlin Signed-off-by: Dan Nowlin Co-developed-by: Jie Wang Signed-off-by: Jie Wang Co-developed-by: Junfeng Guo Signed-off-by: Junfeng Guo Co-developed-by: Qi Zhang Signed-off-by: Qi Zhang Co-developed-by: Ting Xu Signed-off-by: Ting Xu Signed-off-by: Przemek Kitszel Signed-off-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_vf_lib.h | 48 ++++++ drivers/net/ethernet/intel/ice/virt/rss.c | 219 +++++++++++++++++++++++++++- include/linux/avf/virtchnl.h | 50 +++++++ 3 files changed, 316 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h index b00708907176..7a9c75d1d07c 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h @@ -53,6 +53,46 @@ struct ice_mdd_vf_events { u16 last_printed; }; +enum ice_hash_ip_ctx_type { + ICE_HASH_IP_CTX_IP = 0, + ICE_HASH_IP_CTX_IP_ESP, + ICE_HASH_IP_CTX_IP_UDP_ESP, + ICE_HASH_IP_CTX_IP_AH, + ICE_HASH_IP_CTX_IP_PFCP, + ICE_HASH_IP_CTX_IP_UDP, + ICE_HASH_IP_CTX_IP_TCP, + ICE_HASH_IP_CTX_IP_SCTP, + ICE_HASH_IP_CTX_MAX, +}; + +struct ice_vf_hash_ip_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_IP_CTX_MAX]; +}; + +enum ice_hash_gtpu_ctx_type { + ICE_HASH_GTPU_CTX_EH_IP = 0, + ICE_HASH_GTPU_CTX_EH_IP_UDP, + ICE_HASH_GTPU_CTX_EH_IP_TCP, + ICE_HASH_GTPU_CTX_UP_IP, + ICE_HASH_GTPU_CTX_UP_IP_UDP, + ICE_HASH_GTPU_CTX_UP_IP_TCP, + ICE_HASH_GTPU_CTX_DW_IP, + ICE_HASH_GTPU_CTX_DW_IP_UDP, + ICE_HASH_GTPU_CTX_DW_IP_TCP, + ICE_HASH_GTPU_CTX_MAX, +}; + +struct ice_vf_hash_gtpu_ctx { + struct ice_rss_hash_cfg ctx[ICE_HASH_GTPU_CTX_MAX]; +}; + +struct ice_vf_hash_ctx { + struct ice_vf_hash_ip_ctx v4; + struct ice_vf_hash_ip_ctx v6; + struct ice_vf_hash_gtpu_ctx ipv4; + struct ice_vf_hash_gtpu_ctx ipv6; +}; + /* Structure to store fdir fv entry */ struct ice_fdir_prof_info { struct ice_parser_profile prof; @@ -66,6 +106,12 @@ struct ice_vf_qs_bw { u8 tc; }; +/* Structure to store RSS field vector entry */ +struct ice_rss_prof_info { + struct ice_parser_profile prof; + bool symm; +}; + /* VF operations */ struct ice_vf_ops { enum ice_disq_rst_src reset_type; @@ -106,6 +152,8 @@ struct ice_vf { u16 ctrl_vsi_idx; struct ice_vf_fdir fdir; struct ice_fdir_prof_info fdir_prof_info[ICE_MAX_PTGS]; + struct ice_rss_prof_info rss_prof_info[ICE_MAX_PTGS]; + struct ice_vf_hash_ctx hash_ctx; u64 rss_hashcfg; /* RSS hash configuration */ struct ice_sw *vf_sw_id; /* switch ID the VF VSIs connect to */ struct virtchnl_version_info vf_ver; diff --git a/drivers/net/ethernet/intel/ice/virt/rss.c b/drivers/net/ethernet/intel/ice/virt/rss.c index cbdbb32d512b..ee0d1ec32d56 100644 --- a/drivers/net/ethernet/intel/ice/virt/rss.c +++ b/drivers/net/ethernet/intel/ice/virt/rss.c @@ -36,6 +36,11 @@ static const struct ice_vc_hdr_match_type ice_vc_hdr_list[] = { {VIRTCHNL_PROTO_HDR_ESP, ICE_FLOW_SEG_HDR_ESP}, {VIRTCHNL_PROTO_HDR_AH, ICE_FLOW_SEG_HDR_AH}, {VIRTCHNL_PROTO_HDR_PFCP, ICE_FLOW_SEG_HDR_PFCP_SESSION}, + {VIRTCHNL_PROTO_HDR_GTPC, ICE_FLOW_SEG_HDR_GTPC}, + {VIRTCHNL_PROTO_HDR_L2TPV2, ICE_FLOW_SEG_HDR_L2TPV2}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, ICE_FLOW_SEG_HDR_IPV_FRAG}, + {VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, ICE_FLOW_SEG_HDR_IPV_FRAG}, + {VIRTCHNL_PROTO_HDR_GRE, ICE_FLOW_SEG_HDR_GRE}, }; struct ice_vc_hash_field_match_type { @@ -87,8 +92,125 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | 
FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, - {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + {VIRTCHNL_PROTO_HDR_IPV4, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_ID)}, + {VIRTCHNL_PROTO_HDR_IPV4, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + ICE_FLOW_HASH_IPV4}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + 
FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_ID)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_IPV4_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_CHKSUM)}, {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC), BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)}, {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), @@ -110,6 +232,35 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_ID)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + ICE_FLOW_HASH_IPV6_PRE64}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6_PRE64 | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + 
{VIRTCHNL_PROTO_HDR_IPV6, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PRE64_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, {VIRTCHNL_PROTO_HDR_TCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT), BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)}, @@ -120,6 +271,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), ICE_FLOW_HASH_TCP_PORT}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_CHKSUM), + ICE_FLOW_HASH_TCP_PORT | + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_CHKSUM)}, {VIRTCHNL_PROTO_HDR_UDP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT), BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)}, @@ -130,6 +300,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), ICE_FLOW_HASH_UDP_PORT}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_CHKSUM), + ICE_FLOW_HASH_UDP_PORT | + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_CHKSUM)}, {VIRTCHNL_PROTO_HDR_SCTP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT), BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)}, @@ -140,6 +329,25 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), ICE_FLOW_HASH_SCTP_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT) | + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_CHKSUM), + 
ICE_FLOW_HASH_SCTP_PORT | + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_CHKSUM)}, {VIRTCHNL_PROTO_HDR_PPPOE, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID), BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)}, @@ -155,6 +363,15 @@ ice_vc_hash_field_match_type ice_vc_hash_field_list[] = { BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)}, {VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID), BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)}, + {VIRTCHNL_PROTO_HDR_GTPC, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPC_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID)}, + {VIRTCHNL_PROTO_HDR_L2TPV2, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV2_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV2_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_L2TPV2, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV2_LEN_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV2_LEN_SESS_ID)}, }; /** diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 5be1881abbb6..11bdab5522fd 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1253,6 +1253,17 @@ enum virtchnl_proto_hdr_type { VIRTCHNL_PROTO_HDR_ESP, VIRTCHNL_PROTO_HDR_AH, VIRTCHNL_PROTO_HDR_PFCP, + VIRTCHNL_PROTO_HDR_GTPC, + VIRTCHNL_PROTO_HDR_ECPRI, + VIRTCHNL_PROTO_HDR_L2TPV2, + VIRTCHNL_PROTO_HDR_PPP, + /* IPv4 and IPv6 Fragment header types are only associated to + * VIRTCHNL_PROTO_HDR_IPV4 and VIRTCHNL_PROTO_HDR_IPV6 respectively, + * cannot be used independently. + */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG, + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, + VIRTCHNL_PROTO_HDR_GRE, }; /* Protocol header field within a protocol header. */ @@ -1275,6 +1286,7 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_IPV4_DSCP, VIRTCHNL_PROTO_HDR_IPV4_TTL, VIRTCHNL_PROTO_HDR_IPV4_PROT, + VIRTCHNL_PROTO_HDR_IPV4_CHKSUM, /* IPV6 */ VIRTCHNL_PROTO_HDR_IPV6_SRC = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6), @@ -1282,18 +1294,34 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_IPV6_TC, VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, VIRTCHNL_PROTO_HDR_IPV6_PROT, + /* IPV6 Prefix */ + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_DST, /* TCP */ VIRTCHNL_PROTO_HDR_TCP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP), VIRTCHNL_PROTO_HDR_TCP_DST_PORT, + VIRTCHNL_PROTO_HDR_TCP_CHKSUM, /* UDP */ VIRTCHNL_PROTO_HDR_UDP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP), VIRTCHNL_PROTO_HDR_UDP_DST_PORT, + VIRTCHNL_PROTO_HDR_UDP_CHKSUM, /* SCTP */ VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP), VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, + VIRTCHNL_PROTO_HDR_SCTP_CHKSUM, /* GTPU_IP */ VIRTCHNL_PROTO_HDR_GTPU_IP_TEID = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP), @@ -1317,6 +1345,28 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_PFCP_S_FIELD = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP), VIRTCHNL_PROTO_HDR_PFCP_SEID, + /* GTPC */ + VIRTCHNL_PROTO_HDR_GTPC_TEID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPC), + /* ECPRI */ + VIRTCHNL_PROTO_HDR_ECPRI_MSG_TYPE = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ECPRI), + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + /* IPv4 Dummy Fragment */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID = + 
PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4_FRAG), + /* IPv6 Extension Fragment */ + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG), + /* GTPU_DWN/UP */ + VIRTCHNL_PROTO_HDR_GTPU_DWN_QFI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN), + VIRTCHNL_PROTO_HDR_GTPU_UP_QFI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP), + /* L2TPv2 */ + VIRTCHNL_PROTO_HDR_L2TPV2_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV2), + VIRTCHNL_PROTO_HDR_L2TPV2_LEN_SESS_ID, }; struct virtchnl_proto_hdr { -- cgit v1.2.3 From 9311e6c29b348b005e79228ef6facd38ebcc73f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Nov 2025 08:12:36 -1000 Subject: cgroup: Fix sleeping from invalid context warning on PREEMPT_RT cgroup_task_dead() is called from finish_task_switch() which runs with preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The function needs to acquire css_set_lock which is a regular spinlock that can sleep on RT kernels, leading to "sleeping function called from invalid context" warnings. css_set_lock is too large in scope to convert to a raw_spinlock. However, the unlinking operations don't need to run synchronously - they just need to complete after the task is done running. On PREEMPT_RT, defer the work through irq_work. While the work doesn't need to happen immediately, it can't be delayed indefinitely either as the dead task pins the cgroup and task_struct can be pinned indefinitely. Use the lazy version of irq_work to allow batching and lower impact while ensuring timely completion. v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior). Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") Reported-by: Calvin Owens Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org Signed-off-by: Tejun Heo --- include/linux/sched.h | 5 ++++- kernel/cgroup/cgroup.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..5e80d48488ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1324,7 +1324,10 @@ struct task_struct { struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; -#endif +#ifdef CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aae180d56c8c..48019a661c08 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline @@ -6360,6 +6361,7 @@ int __init cgroup_init(void) BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); + cgroup_rt_init(); cgroup_lock(); @@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_task_dead(struct task_struct *tsk) +static void do_cgroup_task_dead(struct task_struct *tsk) { struct css_set *cset; unsigned long flags; @@ -7016,6 +7018,57 
@@ void cgroup_task_dead(struct task_struct *tsk) spin_unlock_irqrestore(&css_set_lock, flags); } +#ifdef CONFIG_PREEMPT_RT +/* + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow + * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, + * this lead to sleeping in the invalid context warning bug. css_set_lock is too + * big to become a raw_spinlock. The task_dead path doesn't need to run + * synchronously but can't be delayed indefinitely either as the dead task pins + * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy + * irq_work to allow batching while ensuring timely completion. + */ +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); + +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) +{ + struct llist_node *lnode; + struct task_struct *task, *next; + + lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); + llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { + do_cgroup_task_dead(task); + put_task_struct(task); + } +} + +static void __init cgroup_rt_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); + per_cpu(cgrp_dead_tasks_iwork, cpu) = + IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); + } +} + +void cgroup_task_dead(struct task_struct *task) +{ + get_task_struct(task); + llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); + irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); +} +#else /* CONFIG_PREEMPT_RT */ +static void __init cgroup_rt_init(void) {} + +void cgroup_task_dead(struct task_struct *task) +{ + do_cgroup_task_dead(task); +} +#endif /* CONFIG_PREEMPT_RT */ + void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; -- cgit v1.2.3 From 15638d52cbcf6e969f4a5e2757b118355db583f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2025 14:52:15 -0500 Subject: block: fix cached zone reporting after zone append was used No zone plugs are allocated when a zone is opened by calling Zone Append on it. This makes the cached zone reporting report incorrectly empty zones if the file system is unmounted and report zones is called after that, e.g. by xfstests test cases using the scratch device. Fix this by recording if zone append was used on a device, and disable cached reporting for the device until a ZONE_RESET_ALL happens that guarantees all zones are empty. We could probably do even better using a per-zone flag, but the practical use cache for zone reporting after the initial mount are rather limited, so let's keep things simple for now. Fixes: 31f0656a4ab7 ("block: introduce blkdev_report_zones_cached()") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 26 +++++++++++++++++++++----- include/linux/blkdev.h | 1 + 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a0ce17e2143f..c5226bcaaa94 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -899,6 +899,19 @@ static int blkdev_report_zone_fallback(struct block_device *bdev, return blkdev_do_report_zones(bdev, sector, 1, &args); } +/* + * For devices that natively support zone append operations, we do not use zone + * write plugging for zone append writes, which makes the zone condition + * tracking invalid once zone append was used. 
In that case fall back to a + * regular report zones to get correct information. + */ +static inline bool blkdev_has_cached_report_zones(struct block_device *bdev) +{ + return disk_need_zone_resources(bdev->bd_disk) && + (bdev_emulates_zone_append(bdev) || + !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state)); +} + /** * blkdev_get_zone_info - Get a single zone information from cached data * @bdev: Target block device @@ -932,6 +945,9 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, memset(zone, 0, sizeof(*zone)); sector = ALIGN_DOWN(sector, zone_sectors); + if (!blkdev_has_cached_report_zones(bdev)) + return blkdev_report_zone_fallback(bdev, sector, zone); + rcu_read_lock(); zones_cond = rcu_dereference(disk->zones_cond); if (!disk->zone_wplugs_hash || !zones_cond) { @@ -1035,11 +1051,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= capacity) return 0; - /* - * If we do not have any zone write plug resources, fallback to using - * the regular zone report. - */ - if (!disk_need_zone_resources(disk)) { + if (!blkdev_has_cached_report_zones(bdev)) { struct blk_report_zones_args args = { .cb = cb, .data = data, @@ -1115,6 +1127,7 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio) for (sector = 0; sector < capacity; sector += bdev_zone_sectors(bio->bi_bdev)) disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); + clear_bit(GD_ZONE_APPEND_USED, &disk->state); } static void blk_zone_finish_bio_endio(struct bio *bio) @@ -1474,6 +1487,9 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) struct blk_zone_wplug *zwplug; unsigned long flags; + if (!test_bit(GD_ZONE_APPEND_USED, &disk->state)) + set_bit(GD_ZONE_APPEND_USED, &disk->state); + /* * We have native support for zone append operations, so we are not * going to handle @bio through plugging. However, we may already have a diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f0ab02e0a673..6a498aa7f7e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -173,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 2f6b2565d43cdb5087cac23d530cca84aa3d897e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Oct 2025 08:04:55 -0700 Subject: block: accumulate memory segment gaps per bio The blk-mq dma iterator has an optimization for requests that align to the device's iommu merge boundary. This boundary may be larger than the device's virtual boundary, but the code had been depending on that queue limit to know ahead of time if the request is guaranteed to align to that optimization. Rather than rely on that queue limit, which many devices may not report, save the lowest set bit of any boundary gap between each segment in the bio while checking the segments. The request stores the value for merging and quickly checking per io if the request can use iova optimizations. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bio.c | 1 + block/blk-map.c | 3 +++ block/blk-merge.c | 39 ++++++++++++++++++++++++++++++++++++--- block/blk-mq-dma.c | 3 +-- block/blk-mq.c | 6 ++++++ include/linux/bio.h | 2 ++ include/linux/blk-mq.h | 16 ++++++++++++++++ include/linux/blk_types.h | 12 ++++++++++++ 8 files changed, 77 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index b3a79285c278..7b13bdf72de0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; + bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; diff --git a/block/blk-map.c b/block/blk-map.c index 60faf036fb6e..17a1dc288678 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -459,6 +459,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; + rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio, + rq->phys_gap_bit); rq->biotail->bi_next = bio; rq->biotail = bio; rq->__data_len += bio->bi_iter.bi_size; @@ -469,6 +471,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) rq->nr_phys_segments = nr_segs; rq->bio = rq->biotail = bio; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index c47d18587a0b..3ca6fbf8b787 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -302,6 +302,12 @@ static unsigned int bio_split_alignment(struct bio *bio, return lim->logical_block_size; } +static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, + struct bio_vec *bv) +{ + return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len); +} + /** * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split @@ -319,8 +325,8 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; + unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; - unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { if (bv.bv_offset & lim->dma_alignment || @@ -331,8 +337,11 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) - goto split; + if (bvprvp) { + if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) + goto split; + gaps |= bvec_seg_gap(bvprvp, &bv); + } if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && @@ -350,6 +359,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; + bio->bi_bvec_gap_bit = ffs(gaps); return 0; split: if (bio->bi_opf & REQ_ATOMIC) @@ -385,6 +395,7 @@ split: * big IO can be trival, disable iopoll when split needed. 
*/ bio_clear_polled(bio); + bio->bi_bvec_gap_bit = ffs(gaps); return bytes >> SECTOR_SHIFT; } EXPORT_SYMBOL_GPL(bio_split_io_at); @@ -721,6 +732,21 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq, return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); } +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit) +{ + struct bio_vec pb, nb; + + gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit); + gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit); + + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (!biovec_phys_mergeable(q, &pb, &nb)) + gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb))); + return gaps_bit; +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -785,6 +811,9 @@ static struct request *attempt_merge(struct request_queue *q, if (next->start_time_ns < req->start_time_ns) req->start_time_ns = next->start_time_ns; + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio, + min_not_zero(next->phys_gap_bit, + req->phys_gap_bit)); req->biotail->bi_next = next->bio; req->biotail = next->biotail; @@ -908,6 +937,8 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req, if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_bio_merged(bio); + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio, + req->phys_gap_bit); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -942,6 +973,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, blk_update_mixed_merge(req, bio, true); + req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio, + req->phys_gap_bit); bio->bi_next = req->bio; req->bio = bio; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 449950029872..94d3461b5bc8 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, static inline bool blk_can_dma_map_iova(struct request *req, struct device *dma_dev) { - return !((queue_virt_boundary(req->q) + 1) & - dma_get_merge_boundary(dma_dev)); + return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev)); } static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) diff --git a/block/blk-mq.c b/block/blk-mq.c index d626d32f6e57..b2fdeaac0efb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; + rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; @@ -668,6 +669,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, goto out_queue_exit; } rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -748,6 +750,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -2674,6 +2677,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = 
bio->bi_bvec_gap_bit; + rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, @@ -3380,6 +3385,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; + rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; diff --git a/include/linux/bio.h b/include/linux/bio.h index 16c1c85613b7..ad2d57908c1c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align); +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b25d12545f46..b54506b3b76d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,6 +152,14 @@ struct request { unsigned short nr_phys_segments; unsigned short nr_integrity_segments; + /* + * The lowest set bit for address gaps between physical segments. This + * provides information necessary for dma optimization opprotunities, + * like for testing if the segments can be coalesced against the + * device's iommu granule. + */ + unsigned char phys_gap_bit; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; @@ -208,6 +216,14 @@ struct request { void *end_io_data; }; +/* + * Returns a mask with all bits starting at req->phys_gap_bit set to 1. + */ +static inline unsigned long req_phys_gap_mask(const struct request *req) +{ + return ~(((1 << req->phys_gap_bit) >> 1) - 1); +} + static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8e8d1cc8b06c..53501ebb0623 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -218,6 +218,18 @@ struct bio { enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; + + /* + * The bvec gap bit indicates the lowest set bit in any address offset + * between all bi_io_vecs. This field is initialized only after the bio + * is split to the hardware limits (see bio_split_io_at()). The value + * may be used to consider DMA optimization when performing that + * mapping. The value is compared to a power of two mask where the + * result depends on any bit set within the mask, so saving the lowest + * bit is sufficient to know if any segment gap collides with the mask. + */ + u8 bi_bvec_gap_bit; + atomic_t __bi_remaining; struct bvec_iter bi_iter; -- cgit v1.2.3 From 4c0a17e28340e458627d672564200406e220d6a3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:33 +0100 Subject: slab: prevent recursive kmalloc() in alloc_empty_sheaf() We want to expand usage of sheaves to all non-boot caches, including kmalloc caches. Since sheaves themselves are also allocated by kmalloc(), we need to prevent excessive or infinite recursion - depending on sheaf size, the sheaf can be allocated from smaller, same or larger kmalloc size bucket, there's no particular constraint. This is similar to allocating the objext arrays so let's just reuse the existing mechanisms for those. 
__GFP_NO_OBJ_EXT in alloc_empty_sheaf() will prevent a nested kmalloc() from allocating a sheaf itself - it will either have sheaves already, or fallback to a non-sheaf-cached allocation (so bootstrap of sheaves in a kmalloc cache that allocates sheaves from its own size bucket is possible). Additionally, reuse OBJCGS_CLEAR_MASK to clear unwanted gfp flags from the nested allocation. Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-5-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/gfp_types.h | 6 ------ mm/slub.c | 36 ++++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 65db9349f905..3de43b12209e 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -55,9 +55,7 @@ enum { #ifdef CONFIG_LOCKDEP ___GFP_NOLOCKDEP_BIT, #endif -#ifdef CONFIG_SLAB_OBJ_EXT ___GFP_NO_OBJ_EXT_BIT, -#endif ___GFP_LAST_BIT }; @@ -98,11 +96,7 @@ enum { #else #define ___GFP_NOLOCKDEP 0 #endif -#ifdef CONFIG_SLAB_OBJ_EXT #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -#else -#define ___GFP_NO_OBJ_EXT 0 -#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) diff --git a/mm/slub.c b/mm/slub.c index a7c6d79154f8..f729c208965b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2031,6 +2031,14 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, } #endif /* CONFIG_SLUB_DEBUG */ +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + #ifdef CONFIG_SLAB_OBJ_EXT #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG @@ -2081,14 +2089,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. 
- */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) - static inline void init_slab_obj_exts(struct slab *slab) { slab->obj_exts = 0; @@ -2596,8 +2596,24 @@ static void *setup_object(struct kmem_cache *s, void *object) static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) { - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, - s->sheaf_capacity), gfp); + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) return NULL; -- cgit v1.2.3 From ce284f882022ebcb953984c7eccf4fc4eb531978 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Thu, 16 Oct 2025 15:38:01 +0200 Subject: pwm: Export `pwmchip_release` for external use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upcoming Rust abstraction layer for the PWM subsystem uses a custom `dev->release` handler to safely manage the lifetime of its driver data. To prevent leaking the memory of the `struct pwm_chip` (allocated by `pwmchip_alloc`), this custom handler must also call the original `pwmchip_release` function to complete the cleanup. Make `pwmchip_release` a global, exported function so that it can be called from the Rust FFI bridge. This involves removing the `static` keyword, adding a prototype to the public header, and exporting the symbol. Reviewed-by: Elle Rhumsaa Signed-off-by: Michal Wilczynski Link: https://patch.msgid.link/20251016-rust-next-pwm-working-fan-for-sending-v16-1-a5df2405d2bd@samsung.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 3 ++- include/linux/pwm.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index ea2ccf42e814..47c9333baaf6 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -1608,12 +1608,13 @@ void pwmchip_put(struct pwm_chip *chip) } EXPORT_SYMBOL_GPL(pwmchip_put); -static void pwmchip_release(struct device *pwmchip_dev) +void pwmchip_release(struct device *pwmchip_dev) { struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev); kfree(chip); } +EXPORT_SYMBOL_GPL(pwmchip_release); struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv) { diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 549ac4aaad59..148f056f336b 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -488,6 +488,12 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner); #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE) void pwmchip_remove(struct pwm_chip *chip); +/* + * For FFI wrapper use only: + * The Rust PWM abstraction needs this to properly free the pwm_chip. + */ +void pwmchip_release(struct device *dev); + int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner); #define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE) -- cgit v1.2.3 From 37827223f86aa71b267769d5f51ca16b44b45ae5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 5 Nov 2025 12:32:12 -0800 Subject: srcu: Add SRCU_READ_FLAVOR_FAST_UPDOWN CPP macro This commit adds the SRCU_READ_FLAVOR_FAST_UPDOWN=0x8 macro and adjusts rcutorture to make use of it. In this commit, both SRCU_READ_FLAVOR_FAST=0x4 and the new SRCU_READ_FLAVOR_FAST_UPDOWN test SRCU-fast. When the SRCU-fast-updown is added, the new SRCU_READ_FLAVOR_FAST_UPDOWN macro will test it when passed to the rcutorture.reader_flavor module parameter. The old SRCU_READ_FLAVOR_FAST macro's value changed from 0x8 to 0x4. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 16 +++++++++------- kernel/rcu/rcutorture.c | 24 ++++++++++++++++++------ 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 41e27c1d917d..1dd6812aabe7 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,13 +56,15 @@ int init_srcu_struct_fast(struct srcu_struct *ssp); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -// 0x4 // SRCU-lite is no longer with us. -#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). -#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_FAST) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_FAST +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +// 0x4 // SRCU-lite is no longer with us. +#define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) + // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // Flavors requiring synchronize_rcu() // instead of smp_mb(). 
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ec9d474d60cb..8973cae0a3ef 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -702,6 +702,8 @@ static void srcu_torture_init(void) rcu_sync_torture_init(); if (reader_flavor & SRCU_READ_FLAVOR_FAST) srcu_ctlp = &srcu_ctlf; + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_ctlp = &srcu_ctlf; } static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) @@ -728,6 +730,12 @@ static int srcu_torture_read_lock(void) ret += idx << 1; } if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); @@ -758,8 +766,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & SRCU_READ_FLAVOR_FAST) + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 2)); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -793,7 +803,7 @@ static int srcu_torture_down_read(void) WARN_ON_ONCE(idx & ~0x1); return idx; } - if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { scp = srcu_down_read_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); @@ -806,7 +816,7 @@ static int srcu_torture_down_read(void) static void srcu_torture_up_read(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & SRCU_READ_FLAVOR_FAST) + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -901,14 +911,16 @@ static struct rcu_torture_ops srcu_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; static void srcud_torture_init(void) { rcu_sync_torture_init(); - if (reader_flavor & SRCU_READ_FLAVOR_FAST) + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + else if (reader_flavor & SRCU_READ_FLAVOR_FAST) WARN_ON(init_srcu_struct_fast(&srcu_ctld)); else WARN_ON(init_srcu_struct(&srcu_ctld)); @@ -953,7 +965,7 @@ static struct rcu_torture_ops srcud_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 
0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; -- cgit v1.2.3 From 187de7c212e5fa87779e1026bf949337bca0cdaa Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 27 Oct 2025 17:18:03 +0106 Subject: printk: nbcon: Allow unsafe write_atomic() for panic There may be console drivers that have not yet figured out a way to implement safe atomic printing (->write_atomic() callback). These drivers could choose to only implement threaded printing (->write_thread() callback), but then it is guaranteed that _no_ output will be printed during panic. Not even attempted. As a result, developers may be tempted to implement unsafe ->write_atomic() callbacks and/or implement some sort of custom deferred printing trickery to try to make it work. This goes against the principle intention of the nbcon API as well as endangers other nbcon drivers that are doing things correctly (safely). As a compromise, allow nbcon drivers to implement unsafe ->write_atomic() callbacks by providing a new console flag CON_NBCON_ATOMIC_UNSAFE. When specified, the ->write_atomic() callback for that console will _only_ be called during the final "hope and pray" flush attempt at the end of a panic: nbcon_atomic_flush_unsafe(). Signed-off-by: John Ogness Link: https://lore.kernel.org/lkml/b2qps3uywhmjaym4mht2wpxul4yqtuuayeoq4iv4k3zf5wdgh3@tocu6c7mj4lt Reviewed-by: Petr Mladek Link: https://lore.kernel.org/all/swdpckuwwlv3uiessmtnf2jwlx3jusw6u7fpk5iggqo4t2vdws@7rpjso4gr7qp/ [1] Link: https://lore.kernel.org/all/20251103-fix_netpoll_aa-v4-1-4cfecdf6da7c@debian.org/ [2] Link: https://patch.msgid.link/20251027161212.334219-2-john.ogness@linutronix.de [pmladek@suse.com: Fix build with rework/nbcon-in-kdb branch.] Signed-off-by: Petr Mladek --- include/linux/console.h | 19 ++++++++++++++++--- kernel/printk/nbcon.c | 47 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index d17f1f525bec..5f17321ed962 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -186,6 +186,8 @@ static inline void con_debug_leave(void) { } * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. + * @CON_NBCON_ATOMIC_UNSAFE: The write_atomic() callback is not safe and is + * therefore only used by nbcon_atomic_flush_unsafe(). */ enum cons_flags { CON_PRINTBUFFER = BIT(0), @@ -197,6 +199,7 @@ enum cons_flags { CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), + CON_NBCON_ATOMIC_UNSAFE = BIT(9), }; /** @@ -608,6 +611,7 @@ extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); +extern bool nbcon_allow_unsafe_takeover(void); extern bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt); extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); @@ -627,9 +631,18 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_ return false; if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; + if (use_atomic) { + /* The write_atomic() callback is optional. 
*/ + if (!con->write_atomic) + return false; + + /* + * An unsafe write_atomic() callback is only usable + * when unsafe takeovers are allowed. + */ + if ((flags & CON_NBCON_ATOMIC_UNSAFE) && !nbcon_allow_unsafe_takeover()) + return false; + } /* * For the !use_atomic case, @printk_kthreads_running is not diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fdd1cbebe77d..90412c2b2961 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1408,6 +1408,26 @@ enum nbcon_prio nbcon_get_default_prio(void) return NBCON_PRIO_NORMAL; } +/* + * Track if it is allowed to perform unsafe hostile takeovers of console + * ownership. When true, console drivers might perform unsafe actions while + * printing. It is externally available via nbcon_allow_unsafe_takeover(). + */ +static bool panic_nbcon_allow_unsafe_takeover; + +/** + * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed + * + * Return: True, when it is permitted to perform unsafe console printing + * + * This is also used by console_is_usable() to determine if it is allowed to + * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). + */ +bool nbcon_allow_unsafe_takeover(void) +{ + return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; +} + /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts @@ -1478,7 +1498,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. @@ -1497,8 +1516,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * returned, it cannot be expected that the unfinalized record will become * available. */ -static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); @@ -1507,7 +1525,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); - ctxt->allow_unsafe_takeover = allow_unsafe_takeover; + ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; @@ -1538,15 +1556,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. 
*/ -static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; unsigned long flags; @@ -1561,7 +1577,7 @@ again: */ local_irq_save(flags); - err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + err = __nbcon_atomic_flush_pending_con(con, stop_seq); local_irq_restore(flags); @@ -1593,9 +1609,8 @@ again: * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ -static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; @@ -1613,7 +1628,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove if (nbcon_seq_read(con) >= stop_seq) continue; - nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } @@ -1629,7 +1644,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove */ void nbcon_atomic_flush_pending(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** @@ -1641,7 +1656,9 @@ void nbcon_atomic_flush_pending(void) */ void nbcon_atomic_flush_unsafe(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); + panic_nbcon_allow_unsafe_takeover = true; + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); + panic_nbcon_allow_unsafe_takeover = false; } /** @@ -1848,7 +1865,7 @@ void nbcon_device_release(struct console *con) * using the legacy loop. */ if (ft.nbcon_atomic) { - __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb)); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); @@ -1918,5 +1935,5 @@ void nbcon_kdb_release(struct nbcon_write_context *wctxt) * The console was locked only when the write_atomic() callback * was usable. */ - __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb)); } -- cgit v1.2.3 From 7ab06ea41af53aa1713186ceaa154179e4b0d4c9 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 5 Nov 2025 18:38:49 +0800 Subject: arch_topology: Provide a stub topology_core_has_smt() for !CONFIG_GENERIC_ARCH_TOPOLOGY The arm_pmu driver is using topology_core_has_smt() for retrieving the SMT implementation which depends on CONFIG_GENERIC_ARCH_TOPOLOGY. The config is optional on arm platforms so provide a !CONFIG_GENERIC_ARCH_TOPOLOGY stub for topology_core_has_smt(). 
Fixes: c3d78c34ad00 ("perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511041757.vuCGOmFc-lkp@intel.com/ Suggested-by: Will Deacon Signed-off-by: Yicong Yang Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- include/linux/arch_topology.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index daa1af2e8204..0c2a8b846c20 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -100,6 +100,10 @@ static inline bool topology_core_has_smt(int cpu) return cpu_topology[cpu].thread_id != -1; } -#endif +#else + +static inline bool topology_core_has_smt(int cpu) { return false; } + +#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- cgit v1.2.3 From 25976c314f6596254c9b1e2291d94393b7d5ae81 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Nov 2025 15:38:44 +0900 Subject: block: introduce bdev_zone_start() Introduce the function bdev_zone_start() as a more explicit (and clear) replacement for ALIGN_DOWN() to get the start sector of a zone containing a particular sector of a zoned block device. Use this new helper in blkdev_get_zone_info() and blkdev_report_zones_cached(). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ++-- include/linux/blkdev.h | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index b580d59ce210..3791755bc6ad 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -950,7 +950,7 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, return -EINVAL; memset(zone, 0, sizeof(*zone)); - sector = ALIGN_DOWN(sector, zone_sectors); + sector = bdev_zone_start(bdev, sector); if (!blkdev_has_cached_report_zones(bdev)) return blkdev_report_zone_fallback(bdev, sector, zone); @@ -1068,7 +1068,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, return blkdev_do_report_zones(bdev, sector, nr_zones, &args); } - for (sector = ALIGN_DOWN(sector, zone_sectors); + for (sector = bdev_zone_start(bdev, sector); sector < capacity && idx < nr_zones; sector += zone_sectors, idx++) { ret = blkdev_get_zone_info(bdev, sector, &zone); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a498aa7f7e7..2fff8a80dbd2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return q->limits.chunk_sectors; } +static inline sector_t bdev_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & ~(bdev_zone_sectors(bdev) - 1); +} + static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { -- cgit v1.2.3 From 9818af18db4bfefd320d0fef41390a616365e6f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Nov 2025 11:50:00 +0100 Subject: compiler_types: Move unused static inline functions warning to W=2 Per Nathan, clang catches unused "static inline" functions in C files since commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). Linus said: > So I entirely ignore W=1 issues, because I think so many of the extra > warnings are bogus. 
> > But if this one in particular is causing more problems than most - > some teams do seem to use W=1 as part of their test builds - it's fine > to send me a patch that just moves bad warnings to W=2. > > And if anybody uses W=2 for their test builds, that's THEIR problem.. Here is the change to bump the warning from W=1 to W=2. Fixes: 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build") Signed-off-by: Peter Zijlstra Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251106105000.2103276-1-andriy.shevchenko@linux.intel.com [nathan: Adjust comment as well] Signed-off-by: Nathan Chancellor --- include/linux/compiler_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..339603f05b54 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -250,10 +250,9 @@ struct ftrace_likely_data { /* * GCC does not warn about unused static inline functions for -Wunused-function. * Suppress the warning in clang as well by using __maybe_unused, but enable it - * for W=1 build. This will allow clang to find unused functions. Remove the - * __inline_maybe_unused entirely after fixing most of -Wunused-function warnings. + * for W=2 build. This will allow clang to find unused functions. */ -#ifdef KBUILD_EXTRA_WARN1 +#ifdef KBUILD_EXTRA_WARN2 #define __inline_maybe_unused #else #define __inline_maybe_unused __maybe_unused -- cgit v1.2.3 From b87ee13e34931779ac1dcd3264beba50b54966fd Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Wed, 5 Nov 2025 10:42:12 +0530 Subject: net: phy: phy-c45: add OATC14 10BASE-T1S PHY cable diagnostic support Add support for Open Alliance TC14 (OATC14) 10BASE-T1S PHYs cable diagnostic feature. This patch implements: - genphy_c45_oatc14_cable_test_start() to initiate a cable test - genphy_c45_oatc14_cable_test_get_status() to retrieve test results - Helper function to map PHY cable test status to ethtool result codes - Function declarations and exports for use by PHY drivers This enables ethtool to report ok, open, short, and undetectable cable conditions on OATC14 10Base-T1S PHYs. 
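For illustration only, a 10BASE-T1S PHY driver could plug the new helpers into the generic cable-test hooks along these lines (the driver name, PHY ID and module strings below are placeholders, not taken from this patch):

  #include <linux/module.h>
  #include <linux/phy.h>

  static struct phy_driver example_t1s_driver[] = {
  	{
  		/* Placeholder PHY ID for the sketch */
  		PHY_ID_MATCH_MODEL(0x001bc400),
  		.name			= "Example OATC14 10BASE-T1S",
  		/* Reuse the generic OATC14 diagnostic helpers */
  		.cable_test_start	= genphy_c45_oatc14_cable_test_start,
  		.cable_test_get_status	= genphy_c45_oatc14_cable_test_get_status,
  	},
  };
  module_phy_driver(example_t1s_driver);

  MODULE_DESCRIPTION("Example 10BASE-T1S PHY driver (illustrative)");
  MODULE_LICENSE("GPL");

A driver wired up this way can then have a cable test triggered from user space with ethtool --cable-test <iface>.
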
Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features Specification ref: https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20251105051213.50443-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio-open-alliance.h | 36 +++++++++++ drivers/net/phy/phy-c45.c | 122 +++++++++++++++++++++++++++++++++++ include/linux/phy.h | 3 + 3 files changed, 161 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio-open-alliance.h b/drivers/net/phy/mdio-open-alliance.h index 931e14660d75..6850a3f0b31e 100644 --- a/drivers/net/phy/mdio-open-alliance.h +++ b/drivers/net/phy/mdio-open-alliance.h @@ -43,4 +43,40 @@ /* Version Identifiers */ #define OATC14_IDM 0x0a00 +/* + * Open Alliance TC14 (10BASE-T1S) - Advanced Diagnostic Features Registers + * + * Refer to the OPEN Alliance documentation: + * https://opensig.org/automotive-ethernet-specifications/ + * + * Specification: + * "10BASE-T1S Advanced Diagnostic PHY Features" + * https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf + */ +/* Advanced Diagnostic Features Capability Register*/ +#define MDIO_OATC14_ADFCAP 0xcc00 +#define OATC14_ADFCAP_HDD_CAPABILITY GENMASK(10, 8) + +/* Harness Defect Detection Register */ +#define MDIO_OATC14_HDD 0xcc01 +#define OATC14_HDD_CONTROL BIT(15) +#define OATC14_HDD_READY BIT(14) +#define OATC14_HDD_START_CONTROL BIT(13) +#define OATC14_HDD_VALID BIT(2) +#define OATC14_HDD_SHORT_OPEN_STATUS GENMASK(1, 0) + +/* Bus Short/Open Status: + * 0 0 - no fault; everything is ok. (Default) + * 0 1 - detected as an open or missing termination(s) + * 1 0 - detected as a short or extra termination(s) + * 1 1 - fault but fault type not detectable. More details can be available by + * vender specific register if supported. + */ +enum oatc14_hdd_status { + OATC14_HDD_STATUS_CABLE_OK = 0, + OATC14_HDD_STATUS_OPEN, + OATC14_HDD_STATUS_SHORT, + OATC14_HDD_STATUS_NOT_DETECTABLE, +}; + #endif /* __MDIO_OPEN_ALLIANCE__ */ diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 1a7b32be4625..e8e5be4684ab 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "mdio-open-alliance.h" #include "phylib-internal.h" @@ -1573,3 +1574,124 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev, return ret; } EXPORT_SYMBOL(genphy_c45_ethtool_set_eee); + +/** + * oatc14_cable_test_get_result_code - Convert hardware cable test status to + * ethtool result code. + * @status: The hardware-reported cable test status + * + * This helper function maps the OATC14 HDD cable test status to the + * corresponding ethtool cable test result code. It provides a translation + * between the device-specific status values and the standardized ethtool + * result codes. 
+ * + * Return: + * * ETHTOOL_A_CABLE_RESULT_CODE_OK - Cable is OK + * * ETHTOOL_A_CABLE_RESULT_CODE_OPEN - Open circuit detected + * * ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT - Short circuit detected + * * ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC - Status not detectable or invalid + */ +static int oatc14_cable_test_get_result_code(enum oatc14_hdd_status status) +{ + switch (status) { + case OATC14_HDD_STATUS_CABLE_OK: + return ETHTOOL_A_CABLE_RESULT_CODE_OK; + case OATC14_HDD_STATUS_OPEN: + return ETHTOOL_A_CABLE_RESULT_CODE_OPEN; + case OATC14_HDD_STATUS_SHORT: + return ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT; + case OATC14_HDD_STATUS_NOT_DETECTABLE: + default: + return ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC; + } +} + +/** + * genphy_c45_oatc14_cable_test_get_status - Get status of OATC14 10Base-T1S + * PHY cable test. + * @phydev: pointer to the PHY device structure + * @finished: pointer to a boolean set true if the test is complete + * + * Retrieves the current status of the OATC14 10Base-T1S PHY cable test. + * This function reads the OATC14 HDD register to determine whether the test + * results are valid and whether the test has finished. + * + * If the test is complete, the function reports the cable test result via + * the ethtool cable test interface using ethnl_cable_test_result(), and then + * clears the test control bit in the PHY register to reset the test state. + * + * Return: 0 on success, or a negative error code on failure (e.g. register + * read/write error). + */ +int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev, + bool *finished) +{ + int ret; + u8 sts; + + *finished = false; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD); + if (ret < 0) + return ret; + + if (!(ret & OATC14_HDD_VALID)) + return 0; + + *finished = true; + + sts = FIELD_GET(OATC14_HDD_SHORT_OPEN_STATUS, ret); + + ret = ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, + oatc14_cable_test_get_result_code(sts)); + if (ret) + return ret; + + return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, + MDIO_OATC14_HDD, OATC14_HDD_CONTROL); +} +EXPORT_SYMBOL(genphy_c45_oatc14_cable_test_get_status); + +/** + * genphy_c45_oatc14_cable_test_start - Start a cable test on an OATC14 + * 10Base-T1S PHY. + * @phydev: Pointer to the PHY device structure + * + * This function initiates a cable diagnostic test on a Clause 45 OATC14 + * 10Base-T1S capable PHY device. It first reads the PHY’s advanced diagnostic + * capability register to check if High Definition Diagnostics (HDD) mode is + * supported. If the PHY does not report HDD capability, cable testing is not + * supported and the function returns -EOPNOTSUPP. + * + * For PHYs that support HDD, the function sets the appropriate control bits in + * the OATC14_HDD register to enable and start the cable diagnostic test. 
+ * + * Return: + * * 0 on success + * * -EOPNOTSUPP if the PHY does not support HDD capability + * * A negative error code on I/O or register access failures + */ +int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev) +{ + int ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_ADFCAP); + if (ret < 0) + return ret; + + if (!(ret & OATC14_ADFCAP_HDD_CAPABILITY)) + return -EOPNOTSUPP; + + ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD, + OATC14_HDD_CONTROL); + if (ret) + return ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD); + if (ret < 0) + return ret; + + return phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MDIO_OATC14_HDD, + OATC14_HDD_START_CONTROL); +} +EXPORT_SYMBOL(genphy_c45_oatc14_cable_test_start); diff --git a/include/linux/phy.h b/include/linux/phy.h index d145a200ea21..bf5457341ca8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2251,6 +2251,9 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); +int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev); +int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev, + bool *finished); /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); -- cgit v1.2.3 From f73e0f46bbfab29b111ff52d047f15aa13623972 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 5 Nov 2025 23:09:17 +0100 Subject: net: phy: fixed_phy: shrink size of struct fixed_phy_status All three members are effectively of type bool, so make this explicit and shrink size of struct fixed_phy_status. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/9eca3d7e-fa64-4724-8fdc-f2c1a8f2ae8f@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/fixed_phy.c | 2 +- include/linux/phy_fixed.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index d498d8a9bba6..9bd6937411e4 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(fixed_phy_register); struct phy_device *fixed_phy_register_100fd(void) { static const struct fixed_phy_status status = { - .link = 1, + .link = true, .speed = SPEED_100, .duplex = DUPLEX_FULL, }; diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 8bade999831c..436bff20f324 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -5,11 +5,11 @@ #include struct fixed_phy_status { - int link; int speed; int duplex; - int pause; - int asym_pause; + bool link:1; + bool pause:1; + bool asym_pause:1; }; struct device_node; -- cgit v1.2.3 From bf9f0b00bb7fd0470c1255bcc8e76c81d122a609 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Wed, 29 Oct 2025 16:00:08 +0530 Subject: include: linux: Destage VCHIQ interface headers Move the VCHIQ headers from drivers/staging/vc04_services/include to include/linux/raspberrypi This is done so that they can be shared between the VCHIQ interface (which is going to be de-staged in a subsequent commit from staging) and the VCHIQ drivers left in the staging/vc04_services (namely bcm2835-audio, bcm2835-camera). The include/linux/raspberrypi/ provides a central location to serve both of these areas. 
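For example (sketch, not part of the diff), a consumer now includes the headers from the shared location rather than through relative paths into drivers/staging/vc04_services:

  #include <linux/raspberrypi/vchiq.h>     /* core API, e.g. vchiq_open_service() */
  #include <linux/raspberrypi/vchiq_bus.h> /* struct vchiq_driver, module_vchiq_driver() */
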
Co-developed-by: Umang Jain Signed-off-by: Umang Jain Reviewed-by: Laurent Pinchart Signed-off-by: Jai Luthra Link: https://patch.msgid.link/20251029-vchiq-destage-v3-4-da8d6c83c2c5@ideasonboard.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + .../vc04_services/bcm2835-audio/bcm2835-vchiq.c | 5 +- .../staging/vc04_services/bcm2835-audio/bcm2835.c | 3 +- .../staging/vc04_services/bcm2835-audio/bcm2835.h | 3 +- .../include/linux/raspberrypi/vchiq.h | 112 ---- .../vc04_services/interface/vchiq_arm/vchiq_arm.c | 9 +- .../vc04_services/interface/vchiq_arm/vchiq_arm.h | 164 ------ .../vc04_services/interface/vchiq_arm/vchiq_bus.c | 4 +- .../vc04_services/interface/vchiq_arm/vchiq_bus.h | 60 -- .../vc04_services/interface/vchiq_arm/vchiq_cfg.h | 41 -- .../vc04_services/interface/vchiq_arm/vchiq_core.c | 4 +- .../vc04_services/interface/vchiq_arm/vchiq_core.h | 646 --------------------- .../interface/vchiq_arm/vchiq_debugfs.c | 6 +- .../interface/vchiq_arm/vchiq_debugfs.h | 22 - .../vc04_services/interface/vchiq_arm/vchiq_dev.c | 7 +- .../interface/vchiq_arm/vchiq_ioctl.h | 3 +- .../staging/vc04_services/vchiq-mmal/mmal-vchiq.c | 5 +- include/linux/raspberrypi/vchiq.h | 112 ++++ include/linux/raspberrypi/vchiq_arm.h | 164 ++++++ include/linux/raspberrypi/vchiq_bus.h | 60 ++ include/linux/raspberrypi/vchiq_cfg.h | 41 ++ include/linux/raspberrypi/vchiq_core.h | 646 +++++++++++++++++++++ include/linux/raspberrypi/vchiq_debugfs.h | 22 + 23 files changed, 1072 insertions(+), 1068 deletions(-) delete mode 100644 drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h create mode 100644 include/linux/raspberrypi/vchiq.h create mode 100644 include/linux/raspberrypi/vchiq_arm.h create mode 100644 include/linux/raspberrypi/vchiq_bus.h create mode 100644 include/linux/raspberrypi/vchiq_cfg.h create mode 100644 include/linux/raspberrypi/vchiq_core.h create mode 100644 include/linux/raspberrypi/vchiq_debugfs.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 3da2c26a796b..cd223e119d48 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4829,6 +4829,7 @@ T: git https://github.com/broadcom/stblinux.git F: Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml F: drivers/pci/controller/pcie-brcmstb.c F: drivers/staging/vc04_services +F: include/linux/raspberrypi/vchiq* N: bcm2711 N: bcm2712 N: bcm283* diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c index 0dbe76ee5570..7368b384497f 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c @@ -4,11 +4,12 @@ #include #include #include + +#include + #include "bcm2835.h" #include "vc_vchi_audioserv_defs.h" -#include "../interface/vchiq_arm/vchiq_arm.h" - struct bcm2835_audio_instance { struct device *dev; unsigned int service_handle; diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c index b74cb104e9de..f292a6618166 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c +++ 
b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c @@ -6,7 +6,8 @@ #include #include -#include "../interface/vchiq_arm/vchiq_bus.h" +#include + #include "bcm2835.h" static bool enable_hdmi; diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h index 49ec5b496edb..5a1348747ff4 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h @@ -5,13 +5,12 @@ #define __SOUND_ARM_BCM2835_H #include +#include #include #include #include #include -#include "../include/linux/raspberrypi/vchiq.h" - #define MAX_SUBSTREAMS (8) #define AVAIL_SUBSTREAMS_MASK (0xff) diff --git a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h b/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h deleted file mode 100644 index ee4469f4fc51..000000000000 --- a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ - -#ifndef VCHIQ_H -#define VCHIQ_H - -#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ - (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) - -enum vchiq_reason { - VCHIQ_SERVICE_OPENED, /* service, -, - */ - VCHIQ_SERVICE_CLOSED, /* service, -, - */ - VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ - VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ - VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ - VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ - VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ -}; - -enum vchiq_bulk_mode { - VCHIQ_BULK_MODE_CALLBACK, - VCHIQ_BULK_MODE_BLOCKING, - VCHIQ_BULK_MODE_NOCALLBACK, - VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ -}; - -enum vchiq_service_option { - VCHIQ_SERVICE_OPTION_AUTOCLOSE, - VCHIQ_SERVICE_OPTION_SLOT_QUOTA, - VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, - VCHIQ_SERVICE_OPTION_SYNCHRONOUS, - VCHIQ_SERVICE_OPTION_TRACE -}; - -struct vchiq_header { - /* The message identifier - opaque to applications. */ - int msgid; - - /* Size of message data. 
*/ - unsigned int size; - - char data[]; /* message */ -}; - -struct vchiq_element { - const void __user *data; - unsigned int size; -}; - -struct vchiq_instance; -struct vchiq_state; - -struct vchiq_service_base { - int fourcc; - int (*callback)(struct vchiq_instance *instance, - enum vchiq_reason reason, - struct vchiq_header *header, - unsigned int handle, - void *cb_data, void __user *cb_userdata); - void *userdata; -}; - -struct vchiq_completion_data_kernel { - enum vchiq_reason reason; - struct vchiq_header *header; - void *service_userdata; - void *cb_data; - void __user *cb_userdata; -}; - -struct vchiq_service_params_kernel { - int fourcc; - int (*callback)(struct vchiq_instance *instance, - enum vchiq_reason reason, - struct vchiq_header *header, - unsigned int handle, - void *cb_data, void __user *cb_userdata); - void *userdata; - short version; /* Increment for non-trivial changes */ - short version_min; /* Update for incompatible changes */ -}; - -extern int vchiq_initialise(struct vchiq_state *state, - struct vchiq_instance **pinstance); -extern int vchiq_shutdown(struct vchiq_instance *instance); -extern int vchiq_connect(struct vchiq_instance *instance); -extern int vchiq_open_service(struct vchiq_instance *instance, - const struct vchiq_service_params_kernel *params, - unsigned int *pservice); -extern int vchiq_close_service(struct vchiq_instance *instance, - unsigned int service); -extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service); -extern int vchiq_release_service(struct vchiq_instance *instance, - unsigned int service); -extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_header *header); -extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service, - struct vchiq_header *header); -extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle, - void *data, unsigned int size); -extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service, - const void *data, unsigned int size, void *userdata, - enum vchiq_bulk_mode mode); -extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service, - void *data, unsigned int size, void *userdata, - enum vchiq_bulk_mode mode); -extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service); -extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle, - short *peer_version); -extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle); - -#endif /* VCHIQ_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index a2074069e79e..6a7b96d3dae6 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -30,11 +30,12 @@ #include #include -#include "vchiq_core.h" +#include +#include +#include +#include + #include "vchiq_ioctl.h" -#include "vchiq_arm.h" -#include "vchiq_bus.h" -#include "vchiq_debugfs.h" #define DEVICE_NAME "vchiq" diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h deleted file mode 100644 index e32b02f99024..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * 
Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. - * Copyright (c) 2010-2012 Broadcom. All rights reserved. - */ - -#ifndef VCHIQ_ARM_H -#define VCHIQ_ARM_H - -#include -#include -#include -#include -#include "vchiq_core.h" -#include "vchiq_debugfs.h" - -/* Some per-instance constants */ -#define MAX_COMPLETIONS 128 -#define MAX_SERVICES 64 -#define MAX_ELEMENTS 8 -#define MSG_QUEUE_SIZE 128 - -#define VCHIQ_DRV_MAX_CALLBACKS 10 - -struct rpi_firmware; -struct vchiq_device; - -enum USE_TYPE_E { - USE_TYPE_SERVICE, - USE_TYPE_VCHIQ -}; - -struct vchiq_platform_info { - unsigned int cache_line_size; -}; - -struct vchiq_drv_mgmt { - struct rpi_firmware *fw; - const struct vchiq_platform_info *info; - - bool connected; - int num_deferred_callbacks; - /* Protects connected and num_deferred_callbacks */ - struct mutex connected_mutex; - - void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void); - - struct semaphore free_fragments_sema; - struct semaphore free_fragments_mutex; - char *fragments_base; - char *free_fragments; - unsigned int fragments_size; - - void __iomem *regs; - - struct vchiq_state state; -}; - -struct user_service { - struct vchiq_service *service; - void __user *userdata; - struct vchiq_instance *instance; - char is_vchi; - char dequeue_pending; - char close_pending; - int message_available_pos; - int msg_insert; - int msg_remove; - struct completion insert_event; - struct completion remove_event; - struct completion close_event; - struct vchiq_header *msg_queue[MSG_QUEUE_SIZE]; -}; - -struct bulk_waiter_node { - struct bulk_waiter bulk_waiter; - int pid; - struct list_head list; -}; - -struct vchiq_instance { - struct vchiq_state *state; - struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS]; - int completion_insert; - int completion_remove; - struct completion insert_event; - struct completion remove_event; - struct mutex completion_mutex; - - int connected; - int closing; - int pid; - int mark; - int use_close_delivered; - int trace; - - struct list_head bulk_waiter_list; - struct mutex bulk_waiter_list_mutex; - - struct vchiq_debugfs_node debugfs_node; -}; - -int -vchiq_use_service(struct vchiq_instance *instance, unsigned int handle); - -extern int -vchiq_release_service(struct vchiq_instance *instance, unsigned int handle); - -extern int -vchiq_check_service(struct vchiq_service *service); - -extern void -vchiq_dump_service_use_state(struct vchiq_state *state); - -extern int -vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service, - enum USE_TYPE_E use_type); -extern int -vchiq_release_internal(struct vchiq_state *state, - struct vchiq_service *service); - -extern struct vchiq_debugfs_node * -vchiq_instance_get_debugfs_node(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_use_count(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_pid(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_trace(struct vchiq_instance *instance); - -extern void -vchiq_instance_set_trace(struct vchiq_instance *instance, int trace); - -extern void -vchiq_add_connected_callback(struct vchiq_device *device, - void (*callback)(void)); - -#if IS_ENABLED(CONFIG_VCHIQ_CDEV) - -extern void -vchiq_deregister_chrdev(void); - -extern int -vchiq_register_chrdev(struct device *parent); - -#else - -static inline void vchiq_deregister_chrdev(void) { } -static inline int vchiq_register_chrdev(struct device *parent) { return 0; } - -#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */ - -extern int 
-service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason, - struct vchiq_header *header, unsigned int handle, - void *cb_data, void __user *cb_userdata); - -extern void -free_bulk_waiter(struct vchiq_instance *instance); - -#endif /* VCHIQ_ARM_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c index 41ece91ab88a..f50e637d505c 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c @@ -11,8 +11,8 @@ #include #include -#include "vchiq_arm.h" -#include "vchiq_bus.h" +#include +#include static int vchiq_bus_type_match(struct device *dev, const struct device_driver *drv) { diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h deleted file mode 100644 index 9de179b39f85..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2023 Ideas On Board Oy - */ - -#ifndef _VCHIQ_DEVICE_H -#define _VCHIQ_DEVICE_H - -#include -#include - -struct vchiq_drv_mgmt; - -struct vchiq_device { - struct device dev; - struct vchiq_drv_mgmt *drv_mgmt; -}; - -struct vchiq_driver { - int (*probe)(struct vchiq_device *device); - void (*remove)(struct vchiq_device *device); - int (*resume)(struct vchiq_device *device); - int (*suspend)(struct vchiq_device *device, - pm_message_t state); - - const struct vchiq_device_id *id_table; - struct device_driver driver; -}; - -static inline struct vchiq_device *to_vchiq_device(struct device *d) -{ - return container_of(d, struct vchiq_device, dev); -} - -static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d) -{ - return container_of(d, struct vchiq_driver, driver); -} - -extern const struct bus_type vchiq_bus_type; - -struct vchiq_device * -vchiq_device_register(struct device *parent, const char *name); -void vchiq_device_unregister(struct vchiq_device *dev); - -int vchiq_driver_register(struct vchiq_driver *vchiq_drv); -void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv); - -/** - * module_vchiq_driver() - Helper macro for registering a vchiq driver - * @__vchiq_driver: vchiq driver struct - * - * Helper macro for vchiq drivers which do not do anything special in - * module init/exit. This eliminates a lot of boilerplate. Each module may only - * use this macro once, and calling it replaces module_init() and module_exit() - */ -#define module_vchiq_driver(__vchiq_driver) \ - module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister) - -#endif /* _VCHIQ_DEVICE_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h deleted file mode 100644 index a16d0299996c..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2014 Broadcom. All rights reserved. 
*/ - -#ifndef VCHIQ_CFG_H -#define VCHIQ_CFG_H - -#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') -/* The version of VCHIQ - change with any non-trivial change */ -#define VCHIQ_VERSION 8 -/* - * The minimum compatible version - update to match VCHIQ_VERSION with any - * incompatible change - */ -#define VCHIQ_VERSION_MIN 3 - -/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */ -#define VCHIQ_VERSION_LIB_VERSION 7 - -/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */ -#define VCHIQ_VERSION_CLOSE_DELIVERED 7 - -/* The version that made it safe to use SYNCHRONOUS mode */ -#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8 - -#define VCHIQ_MAX_STATES 1 -#define VCHIQ_MAX_SERVICES 4096 -#define VCHIQ_MAX_SLOTS 128 -#define VCHIQ_MAX_SLOTS_PER_SIDE 64 - -#define VCHIQ_NUM_CURRENT_BULKS 32 -#define VCHIQ_NUM_SERVICE_BULKS 4 - -#ifndef VCHIQ_ENABLE_DEBUG -#define VCHIQ_ENABLE_DEBUG 1 -#endif - -#ifndef VCHIQ_ENABLE_STATS -#define VCHIQ_ENABLE_STATS 1 -#endif - -#endif /* VCHIQ_CFG_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c index 130be2f58342..83de27cfd469 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c @@ -15,8 +15,8 @@ #include #include -#include "vchiq_arm.h" -#include "vchiq_core.h" +#include +#include #define VCHIQ_SLOT_HANDLER_STACK 8192 diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h deleted file mode 100644 index e3ed50d26c37..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h +++ /dev/null @@ -1,646 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2012 Broadcom. All rights reserved. 
*/ - -#ifndef VCHIQ_CORE_H -#define VCHIQ_CORE_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../include/linux/raspberrypi/vchiq.h" -#include "vchiq_cfg.h" - -/* Do this so that we can test-build the code on non-rpi systems */ -#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) - -#else - -#ifndef dsb -#define dsb(a) -#endif - -#endif /* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */ - -#define VCHIQ_SERVICE_HANDLE_INVALID 0 - -#define VCHIQ_SLOT_SIZE 4096 -#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header)) - -#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) -#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) -#define VCHIQ_SLOT_ZERO_SLOTS DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \ - VCHIQ_SLOT_SIZE) - -#define BITSET_SIZE(b) ((b + 31) >> 5) -#define BITSET_WORD(b) (b >> 5) -#define BITSET_BIT(b) (1 << (b & 31)) -#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) -#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) - -enum { - DEBUG_ENTRIES, -#if VCHIQ_ENABLE_DEBUG - DEBUG_SLOT_HANDLER_COUNT, - DEBUG_SLOT_HANDLER_LINE, - DEBUG_PARSE_LINE, - DEBUG_PARSE_HEADER, - DEBUG_PARSE_MSGID, - DEBUG_AWAIT_COMPLETION_LINE, - DEBUG_DEQUEUE_MESSAGE_LINE, - DEBUG_SERVICE_CALLBACK_LINE, - DEBUG_MSG_QUEUE_FULL_COUNT, - DEBUG_COMPLETION_QUEUE_FULL_COUNT, -#endif - DEBUG_MAX -}; - -#if VCHIQ_ENABLE_DEBUG - -#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug -#define DEBUG_TRACE(d) \ - do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0) -#define DEBUG_VALUE(d, v) \ - do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0) -#define DEBUG_COUNT(d) \ - do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0) - -#else /* VCHIQ_ENABLE_DEBUG */ - -#define DEBUG_INITIALISE(local) -#define DEBUG_TRACE(d) -#define DEBUG_VALUE(d, v) -#define DEBUG_COUNT(d) - -#endif /* VCHIQ_ENABLE_DEBUG */ - -enum vchiq_connstate { - VCHIQ_CONNSTATE_DISCONNECTED, - VCHIQ_CONNSTATE_CONNECTING, - VCHIQ_CONNSTATE_CONNECTED, - VCHIQ_CONNSTATE_PAUSING, - VCHIQ_CONNSTATE_PAUSE_SENT, - VCHIQ_CONNSTATE_PAUSED, - VCHIQ_CONNSTATE_RESUMING, - VCHIQ_CONNSTATE_PAUSE_TIMEOUT, - VCHIQ_CONNSTATE_RESUME_TIMEOUT -}; - -enum { - VCHIQ_SRVSTATE_FREE, - VCHIQ_SRVSTATE_HIDDEN, - VCHIQ_SRVSTATE_LISTENING, - VCHIQ_SRVSTATE_OPENING, - VCHIQ_SRVSTATE_OPEN, - VCHIQ_SRVSTATE_OPENSYNC, - VCHIQ_SRVSTATE_CLOSESENT, - VCHIQ_SRVSTATE_CLOSERECVD, - VCHIQ_SRVSTATE_CLOSEWAIT, - VCHIQ_SRVSTATE_CLOSED -}; - -enum vchiq_bulk_dir { - VCHIQ_BULK_TRANSMIT, - VCHIQ_BULK_RECEIVE -}; - -struct vchiq_bulk { - short mode; - short dir; - void *cb_data; - void __user *cb_userdata; - struct bulk_waiter *waiter; - dma_addr_t dma_addr; - int size; - void *remote_data; - int remote_size; - int actual; - void *offset; - void __user *uoffset; -}; - -struct vchiq_bulk_queue { - int local_insert; /* Where to insert the next local bulk */ - int remote_insert; /* Where to insert the next remote bulk (master) */ - int process; /* Bulk to transfer next */ - int remote_notify; /* Bulk to notify the remote client of next (mstr) */ - int remove; /* Bulk to notify the local client of, and remove, next */ - struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS]; -}; - -/* - * Remote events provide a way of presenting several virtual doorbells to a - * peer (ARM host to VPU) using only one physical doorbell. They can be thought - * of as a way for the peer to signal a semaphore, in this case implemented as - * a workqueue. 
- * - * Remote events remain signalled until acknowledged by the receiver, and they - * are non-counting. They are designed in such a way as to minimise the number - * of interrupts and avoid unnecessary waiting. - * - * A remote_event is as small data structures that live in shared memory. It - * comprises two booleans - armed and fired: - * - * The sender sets fired when they signal the receiver. - * If fired is set, the receiver has been signalled and need not wait. - * The receiver sets the armed field before they begin to wait. - * If armed is set, the receiver is waiting and wishes to be woken by interrupt. - */ -struct remote_event { - int armed; - int fired; - u32 __unused; -}; - -struct opaque_platform_state; - -struct vchiq_slot { - char data[VCHIQ_SLOT_SIZE]; -}; - -struct vchiq_slot_info { - /* Use two counters rather than one to avoid the need for a mutex. */ - short use_count; - short release_count; -}; - -/* - * VCHIQ is a reliable connection-oriented datagram protocol. - * - * A VCHIQ service is equivalent to a TCP connection, except: - * + FOURCCs are used for the rendezvous, and port numbers are assigned at the - * time the connection is established. - * + There is less of a distinction between server and client sockets, the only - * difference being which end makes the first move. - * + For a multi-client server, the server creates new "listening" services as - * the existing one becomes connected - there is no need to specify the - * maximum number of clients up front. - * + Data transfer is reliable but packetized (messages have defined ends). - * + Messages can be either short (capable of fitting in a slot) and in-band, - * or copied between external buffers (bulk transfers). - */ -struct vchiq_service { - struct vchiq_service_base base; - unsigned int handle; - struct kref ref_count; - struct rcu_head rcu; - int srvstate; - void (*userdata_term)(void *userdata); - unsigned int localport; - unsigned int remoteport; - int public_fourcc; - int client_id; - char auto_close; - char sync; - char closing; - char trace; - atomic_t poll_flags; - short version; - short version_min; - short peer_version; - - struct vchiq_state *state; - struct vchiq_instance *instance; - - int service_use_count; - - struct vchiq_bulk_queue bulk_tx; - struct vchiq_bulk_queue bulk_rx; - - struct completion remove_event; - struct completion bulk_remove_event; - struct mutex bulk_mutex; - - struct service_stats_struct { - int quota_stalls; - int slot_stalls; - int bulk_stalls; - int error_count; - int ctrl_tx_count; - int ctrl_rx_count; - int bulk_tx_count; - int bulk_rx_count; - int bulk_aborted_count; - u64 ctrl_tx_bytes; - u64 ctrl_rx_bytes; - u64 bulk_tx_bytes; - u64 bulk_rx_bytes; - } stats; - - int msg_queue_read; - int msg_queue_write; - struct completion msg_queue_pop; - struct completion msg_queue_push; - struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS]; -}; - -/* - * The quota information is outside struct vchiq_service so that it can - * be statically allocated, since for accounting reasons a service's slot - * usage is carried over between users of the same port number. - */ -struct vchiq_service_quota { - unsigned short slot_quota; - unsigned short slot_use_count; - unsigned short message_quota; - unsigned short message_use_count; - struct completion quota_event; - int previous_tx_index; -}; - -struct vchiq_shared_state { - /* A non-zero value here indicates that the content is valid. */ - int initialised; - - /* The first and last (inclusive) slots allocated to the owner. 
*/ - int slot_first; - int slot_last; - - /* The slot allocated to synchronous messages from the owner. */ - int slot_sync; - - /* - * Signalling this event indicates that owner's slot handler thread - * should run. - */ - struct remote_event trigger; - - /* - * Indicates the byte position within the stream where the next message - * will be written. The least significant bits are an index into the - * slot. The next bits are the index of the slot in slot_queue. - */ - int tx_pos; - - /* This event should be signalled when a slot is recycled. */ - struct remote_event recycle; - - /* The slot_queue index where the next recycled slot will be written. */ - int slot_queue_recycle; - - /* This event should be signalled when a synchronous message is sent. */ - struct remote_event sync_trigger; - - /* - * This event should be signalled when a synchronous message has been - * released. - */ - struct remote_event sync_release; - - /* A circular buffer of slot indexes. */ - int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; - - /* Debugging state */ - int debug[DEBUG_MAX]; -}; - -/* - * vchiq_slot_zero describes the memory shared between the ARM host and the - * VideoCore VPU. The "master" and "slave" states are owned by the respective - * sides but visible to the other; the slots are shared, and the remaining - * fields are read-only. - * - * In the configuration used by this implementation, the memory is allocated - * by the host, the VPU is the master (the side which controls the DMA for bulk - * transfers), and the host is the slave. - * - * The ownership of slots changes with use: - * + When empty they are owned by the sender. - * + When partially filled they are shared with the receiver. - * + When completely full they are owned by the receiver. - * + When the receiver has finished processing the contents, they are recycled - * back to the sender. - */ -struct vchiq_slot_zero { - int magic; - short version; - short version_min; - int slot_zero_size; - int slot_size; - int max_slots; - int max_slots_per_side; - int platform_data[2]; - struct vchiq_shared_state master; - struct vchiq_shared_state slave; - struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS]; -}; - -/* - * This is the private runtime state used by each side. The same structure was - * originally used by both sides, but implementations have since diverged. - */ -struct vchiq_state { - struct device *dev; - int id; - int initialised; - enum vchiq_connstate conn_state; - short version_common; - - struct vchiq_shared_state *local; - struct vchiq_shared_state *remote; - struct vchiq_slot *slot_data; - - unsigned short default_slot_quota; - unsigned short default_message_quota; - - /* Event indicating connect message received */ - struct completion connect; - - /* Mutex protecting services */ - struct mutex mutex; - struct vchiq_instance **instance; - - /* Processes all incoming messages which aren't synchronous */ - struct task_struct *slot_handler_thread; - - /* - * Slots which have been fully processed and released by the (peer) - * receiver are added to the receiver queue, which is asynchronously - * processed by the recycle thread. - */ - struct task_struct *recycle_thread; - - /* - * Processes incoming synchronous messages - * - * The synchronous message channel is shared between all synchronous - * services, and provides a way for urgent messages to bypass - * potentially long queues of asynchronous messages in the normal slots. 
- * - * There can be only one outstanding synchronous message in - * each direction, and as a precious shared resource synchronous - * services should be used sparingly. - */ - struct task_struct *sync_thread; - - /* Local implementation of the trigger remote event */ - wait_queue_head_t trigger_event; - - /* Local implementation of the recycle remote event */ - wait_queue_head_t recycle_event; - - /* Local implementation of the sync trigger remote event */ - wait_queue_head_t sync_trigger_event; - - /* Local implementation of the sync release remote event */ - wait_queue_head_t sync_release_event; - - char *tx_data; - char *rx_data; - struct vchiq_slot_info *rx_info; - - struct mutex slot_mutex; - - struct mutex recycle_mutex; - - struct mutex sync_mutex; - - spinlock_t msg_queue_spinlock; - - spinlock_t bulk_waiter_spinlock; - - spinlock_t quota_spinlock; - - /* - * Indicates the byte position within the stream from where the next - * message will be read. The least significant bits are an index into - * the slot.The next bits are the index of the slot in - * remote->slot_queue. - */ - int rx_pos; - - /* - * A cached copy of local->tx_pos. Only write to local->tx_pos, and read - * from remote->tx_pos. - */ - int local_tx_pos; - - /* The slot_queue index of the slot to become available next. */ - int slot_queue_available; - - /* A flag to indicate if any poll has been requested */ - int poll_needed; - - /* Ths index of the previous slot used for data messages. */ - int previous_data_index; - - /* The number of slots occupied by data messages. */ - unsigned short data_use_count; - - /* The maximum number of slots to be occupied by data messages. */ - unsigned short data_quota; - - /* An array of bit sets indicating which services must be polled. */ - atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; - - /* The number of the first unused service */ - int unused_service; - - /* Signalled when a free slot becomes available. */ - struct completion slot_available_event; - - /* Signalled when a free data slot becomes available. */ - struct completion data_quota_event; - - struct state_stats_struct { - int slot_stalls; - int data_stalls; - int ctrl_tx_count; - int ctrl_rx_count; - int error_count; - } stats; - - struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES]; - struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES]; - struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS]; - - struct opaque_platform_state *platform_state; -}; - -struct pagelist { - u32 length; - u16 type; - u16 offset; - u32 addrs[1]; /* N.B. 12 LSBs hold the number - * of following pages at consecutive - * addresses. 
- */ -}; - -struct vchiq_pagelist_info { - struct pagelist *pagelist; - size_t pagelist_buffer_size; - dma_addr_t dma_addr; - enum dma_data_direction dma_dir; - unsigned int num_pages; - unsigned int pages_need_release; - struct page **pages; - struct scatterlist *scatterlist; - unsigned int scatterlist_mapped; -}; - -static inline bool vchiq_remote_initialised(const struct vchiq_state *state) -{ - return state->remote && state->remote->initialised; -} - -struct bulk_waiter { - struct vchiq_bulk *bulk; - struct completion event; - int actual; -}; - -struct vchiq_config { - unsigned int max_msg_size; - unsigned int bulk_threshold; /* The message size above which it - * is better to use a bulk transfer - * (<= max_msg_size) - */ - unsigned int max_outstanding_bulks; - unsigned int max_services; - short version; /* The version of VCHIQ */ - short version_min; /* The minimum compatible version of VCHIQ */ -}; - -extern spinlock_t bulk_waiter_spinlock; - -extern const char * -get_conn_state_name(enum vchiq_connstate conn_state); - -extern struct vchiq_slot_zero * -vchiq_init_slots(struct device *dev, void *mem_base, int mem_size); - -extern int -vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev); - -extern int -vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance); - -struct vchiq_service * -vchiq_add_service_internal(struct vchiq_state *state, - const struct vchiq_service_params_kernel *params, - int srvstate, struct vchiq_instance *instance, - void (*userdata_term)(void *userdata)); - -extern int -vchiq_open_service_internal(struct vchiq_service *service, int client_id); - -extern int -vchiq_close_service_internal(struct vchiq_service *service, int close_recvd); - -extern void -vchiq_terminate_service_internal(struct vchiq_service *service); - -extern void -vchiq_free_service_internal(struct vchiq_service *service); - -extern void -vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance); - -extern void -remote_event_pollall(struct vchiq_state *state); - -extern int -vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle, - struct bulk_waiter *userdata); - -extern int -vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_bulk *bulk); - -extern int -vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_bulk *bulk); - -extern void -vchiq_dump_state(struct seq_file *f, struct vchiq_state *state); - -extern void -request_poll(struct vchiq_state *state, struct vchiq_service *service, - int poll_type); - -struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_service_by_handle(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_service_by_port(struct vchiq_state *state, unsigned int localport); - -extern struct vchiq_service * -find_service_for_instance(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -__next_service_by_instance(struct vchiq_state *state, - struct vchiq_instance *instance, - int *pidx); - -extern struct vchiq_service * -next_service_by_instance(struct vchiq_state *state, - struct vchiq_instance *instance, - int *pidx); - -extern void -vchiq_service_get(struct vchiq_service *service); - -extern void 
-vchiq_service_put(struct vchiq_service *service); - -extern int -vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle, - ssize_t (*copy_callback)(void *context, void *dest, - size_t offset, size_t maxsize), - void *context, - size_t size); - -void vchiq_dump_platform_state(struct seq_file *f); - -void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f); - -void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service); - -int vchiq_use_service_internal(struct vchiq_service *service); - -int vchiq_release_service_internal(struct vchiq_service *service); - -void vchiq_on_remote_use(struct vchiq_state *state); - -void vchiq_on_remote_release(struct vchiq_state *state); - -int vchiq_platform_init_state(struct vchiq_state *state); - -int vchiq_check_service(struct vchiq_service *service); - -int vchiq_send_remote_use(struct vchiq_state *state); - -int vchiq_send_remote_use_active(struct vchiq_state *state); - -void vchiq_platform_conn_state_changed(struct vchiq_state *state, - enum vchiq_connstate oldstate, - enum vchiq_connstate newstate); - -void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate); - -void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr, - const void *void_mem, size_t num_bytes); - -int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service); - -int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service); - -void vchiq_get_config(struct vchiq_config *config); - -int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service, - enum vchiq_service_option option, int value); - -#endif diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c index d5f7f61c5626..c82326a9b6d9 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c @@ -5,9 +5,9 @@ */ #include -#include "vchiq_core.h" -#include "vchiq_arm.h" -#include "vchiq_debugfs.h" +#include +#include +#include #ifdef CONFIG_DEBUG_FS diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h deleted file mode 100644 index b29e6693c949..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. 
*/ - -#ifndef VCHIQ_DEBUGFS_H -#define VCHIQ_DEBUGFS_H - -struct vchiq_state; -struct vchiq_instance; - -struct vchiq_debugfs_node { - struct dentry *dentry; -}; - -void vchiq_debugfs_init(struct vchiq_state *state); - -void vchiq_debugfs_deinit(void); - -void vchiq_debugfs_add_instance(struct vchiq_instance *instance); - -void vchiq_debugfs_remove_instance(struct vchiq_instance *instance); - -#endif /* VCHIQ_DEBUGFS_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c index 3b20ba5c7362..0f3dde2657d6 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c @@ -11,10 +11,11 @@ #include #include -#include "vchiq_core.h" +#include +#include +#include + #include "vchiq_ioctl.h" -#include "vchiq_arm.h" -#include "vchiq_debugfs.h" static const char *const ioctl_names[] = { "CONNECT", diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h index afb71a83cfe7..d0c759f6d8ea 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h @@ -5,8 +5,7 @@ #define VCHIQ_IOCTLS_H #include - -#include "../../include/linux/raspberrypi/vchiq.h" +#include #define VCHIQ_IOC_MAGIC 0xc4 #define VCHIQ_INVALID_HANDLE (~0) diff --git a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c index c2b5a37915f2..cd073ed3ea2d 100644 --- a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c +++ b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c @@ -22,11 +22,12 @@ #include #include #include +#include #include #include -#include "../include/linux/raspberrypi/vchiq.h" -#include "../interface/vchiq_arm/vchiq_arm.h" +#include + #include "mmal-common.h" #include "mmal-vchiq.h" #include "mmal-msg.h" diff --git a/include/linux/raspberrypi/vchiq.h b/include/linux/raspberrypi/vchiq.h new file mode 100644 index 000000000000..ee4469f4fc51 --- /dev/null +++ b/include/linux/raspberrypi/vchiq.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ + +#ifndef VCHIQ_H +#define VCHIQ_H + +#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ + (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) + +enum vchiq_reason { + VCHIQ_SERVICE_OPENED, /* service, -, - */ + VCHIQ_SERVICE_CLOSED, /* service, -, - */ + VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ + VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ +}; + +enum vchiq_bulk_mode { + VCHIQ_BULK_MODE_CALLBACK, + VCHIQ_BULK_MODE_BLOCKING, + VCHIQ_BULK_MODE_NOCALLBACK, + VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ +}; + +enum vchiq_service_option { + VCHIQ_SERVICE_OPTION_AUTOCLOSE, + VCHIQ_SERVICE_OPTION_SLOT_QUOTA, + VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, + VCHIQ_SERVICE_OPTION_SYNCHRONOUS, + VCHIQ_SERVICE_OPTION_TRACE +}; + +struct vchiq_header { + /* The message identifier - opaque to applications. */ + int msgid; + + /* Size of message data. 
*/ + unsigned int size; + + char data[]; /* message */ +}; + +struct vchiq_element { + const void __user *data; + unsigned int size; +}; + +struct vchiq_instance; +struct vchiq_state; + +struct vchiq_service_base { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; +}; + +struct vchiq_completion_data_kernel { + enum vchiq_reason reason; + struct vchiq_header *header; + void *service_userdata; + void *cb_data; + void __user *cb_userdata; +}; + +struct vchiq_service_params_kernel { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; + short version; /* Increment for non-trivial changes */ + short version_min; /* Update for incompatible changes */ +}; + +extern int vchiq_initialise(struct vchiq_state *state, + struct vchiq_instance **pinstance); +extern int vchiq_shutdown(struct vchiq_instance *instance); +extern int vchiq_connect(struct vchiq_instance *instance); +extern int vchiq_open_service(struct vchiq_instance *instance, + const struct vchiq_service_params_kernel *params, + unsigned int *pservice); +extern int vchiq_close_service(struct vchiq_instance *instance, + unsigned int service); +extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_release_service(struct vchiq_instance *instance, + unsigned int service); +extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_header *header); +extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service, + struct vchiq_header *header); +extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle, + void *data, unsigned int size); +extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service, + const void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service, + void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle, + short *peer_version); +extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle); + +#endif /* VCHIQ_H */ diff --git a/include/linux/raspberrypi/vchiq_arm.h b/include/linux/raspberrypi/vchiq_arm.h new file mode 100644 index 000000000000..e32b02f99024 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_arm.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. + * Copyright (c) 2010-2012 Broadcom. All rights reserved. 
+ */ + +#ifndef VCHIQ_ARM_H +#define VCHIQ_ARM_H + +#include +#include +#include +#include +#include "vchiq_core.h" +#include "vchiq_debugfs.h" + +/* Some per-instance constants */ +#define MAX_COMPLETIONS 128 +#define MAX_SERVICES 64 +#define MAX_ELEMENTS 8 +#define MSG_QUEUE_SIZE 128 + +#define VCHIQ_DRV_MAX_CALLBACKS 10 + +struct rpi_firmware; +struct vchiq_device; + +enum USE_TYPE_E { + USE_TYPE_SERVICE, + USE_TYPE_VCHIQ +}; + +struct vchiq_platform_info { + unsigned int cache_line_size; +}; + +struct vchiq_drv_mgmt { + struct rpi_firmware *fw; + const struct vchiq_platform_info *info; + + bool connected; + int num_deferred_callbacks; + /* Protects connected and num_deferred_callbacks */ + struct mutex connected_mutex; + + void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void); + + struct semaphore free_fragments_sema; + struct semaphore free_fragments_mutex; + char *fragments_base; + char *free_fragments; + unsigned int fragments_size; + + void __iomem *regs; + + struct vchiq_state state; +}; + +struct user_service { + struct vchiq_service *service; + void __user *userdata; + struct vchiq_instance *instance; + char is_vchi; + char dequeue_pending; + char close_pending; + int message_available_pos; + int msg_insert; + int msg_remove; + struct completion insert_event; + struct completion remove_event; + struct completion close_event; + struct vchiq_header *msg_queue[MSG_QUEUE_SIZE]; +}; + +struct bulk_waiter_node { + struct bulk_waiter bulk_waiter; + int pid; + struct list_head list; +}; + +struct vchiq_instance { + struct vchiq_state *state; + struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS]; + int completion_insert; + int completion_remove; + struct completion insert_event; + struct completion remove_event; + struct mutex completion_mutex; + + int connected; + int closing; + int pid; + int mark; + int use_close_delivered; + int trace; + + struct list_head bulk_waiter_list; + struct mutex bulk_waiter_list_mutex; + + struct vchiq_debugfs_node debugfs_node; +}; + +int +vchiq_use_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_release_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_check_service(struct vchiq_service *service); + +extern void +vchiq_dump_service_use_state(struct vchiq_state *state); + +extern int +vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service, + enum USE_TYPE_E use_type); +extern int +vchiq_release_internal(struct vchiq_state *state, + struct vchiq_service *service); + +extern struct vchiq_debugfs_node * +vchiq_instance_get_debugfs_node(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_use_count(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_pid(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_trace(struct vchiq_instance *instance); + +extern void +vchiq_instance_set_trace(struct vchiq_instance *instance, int trace); + +extern void +vchiq_add_connected_callback(struct vchiq_device *device, + void (*callback)(void)); + +#if IS_ENABLED(CONFIG_VCHIQ_CDEV) + +extern void +vchiq_deregister_chrdev(void); + +extern int +vchiq_register_chrdev(struct device *parent); + +#else + +static inline void vchiq_deregister_chrdev(void) { } +static inline int vchiq_register_chrdev(struct device *parent) { return 0; } + +#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */ + +extern int +service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason, + struct vchiq_header *header, unsigned int handle, + void 
*cb_data, void __user *cb_userdata); + +extern void +free_bulk_waiter(struct vchiq_instance *instance); + +#endif /* VCHIQ_ARM_H */ diff --git a/include/linux/raspberrypi/vchiq_bus.h b/include/linux/raspberrypi/vchiq_bus.h new file mode 100644 index 000000000000..9de179b39f85 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_bus.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Ideas On Board Oy + */ + +#ifndef _VCHIQ_DEVICE_H +#define _VCHIQ_DEVICE_H + +#include +#include + +struct vchiq_drv_mgmt; + +struct vchiq_device { + struct device dev; + struct vchiq_drv_mgmt *drv_mgmt; +}; + +struct vchiq_driver { + int (*probe)(struct vchiq_device *device); + void (*remove)(struct vchiq_device *device); + int (*resume)(struct vchiq_device *device); + int (*suspend)(struct vchiq_device *device, + pm_message_t state); + + const struct vchiq_device_id *id_table; + struct device_driver driver; +}; + +static inline struct vchiq_device *to_vchiq_device(struct device *d) +{ + return container_of(d, struct vchiq_device, dev); +} + +static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d) +{ + return container_of(d, struct vchiq_driver, driver); +} + +extern const struct bus_type vchiq_bus_type; + +struct vchiq_device * +vchiq_device_register(struct device *parent, const char *name); +void vchiq_device_unregister(struct vchiq_device *dev); + +int vchiq_driver_register(struct vchiq_driver *vchiq_drv); +void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv); + +/** + * module_vchiq_driver() - Helper macro for registering a vchiq driver + * @__vchiq_driver: vchiq driver struct + * + * Helper macro for vchiq drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_vchiq_driver(__vchiq_driver) \ + module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister) + +#endif /* _VCHIQ_DEVICE_H */ diff --git a/include/linux/raspberrypi/vchiq_cfg.h b/include/linux/raspberrypi/vchiq_cfg.h new file mode 100644 index 000000000000..a16d0299996c --- /dev/null +++ b/include/linux/raspberrypi/vchiq_cfg.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2014 Broadcom. All rights reserved. 
*/ + +#ifndef VCHIQ_CFG_H +#define VCHIQ_CFG_H + +#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') +/* The version of VCHIQ - change with any non-trivial change */ +#define VCHIQ_VERSION 8 +/* + * The minimum compatible version - update to match VCHIQ_VERSION with any + * incompatible change + */ +#define VCHIQ_VERSION_MIN 3 + +/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */ +#define VCHIQ_VERSION_LIB_VERSION 7 + +/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */ +#define VCHIQ_VERSION_CLOSE_DELIVERED 7 + +/* The version that made it safe to use SYNCHRONOUS mode */ +#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8 + +#define VCHIQ_MAX_STATES 1 +#define VCHIQ_MAX_SERVICES 4096 +#define VCHIQ_MAX_SLOTS 128 +#define VCHIQ_MAX_SLOTS_PER_SIDE 64 + +#define VCHIQ_NUM_CURRENT_BULKS 32 +#define VCHIQ_NUM_SERVICE_BULKS 4 + +#ifndef VCHIQ_ENABLE_DEBUG +#define VCHIQ_ENABLE_DEBUG 1 +#endif + +#ifndef VCHIQ_ENABLE_STATS +#define VCHIQ_ENABLE_STATS 1 +#endif + +#endif /* VCHIQ_CFG_H */ diff --git a/include/linux/raspberrypi/vchiq_core.h b/include/linux/raspberrypi/vchiq_core.h new file mode 100644 index 000000000000..e7bf7a114985 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_core.h @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ + +#ifndef VCHIQ_CORE_H +#define VCHIQ_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vchiq.h" +#include "vchiq_cfg.h" + +/* Do this so that we can test-build the code on non-rpi systems */ +#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) + +#else + +#ifndef dsb +#define dsb(a) +#endif + +#endif /* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */ + +#define VCHIQ_SERVICE_HANDLE_INVALID 0 + +#define VCHIQ_SLOT_SIZE 4096 +#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header)) + +#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) +#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) +#define VCHIQ_SLOT_ZERO_SLOTS DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \ + VCHIQ_SLOT_SIZE) + +#define BITSET_SIZE(b) ((b + 31) >> 5) +#define BITSET_WORD(b) (b >> 5) +#define BITSET_BIT(b) (1 << (b & 31)) +#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) +#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) + +enum { + DEBUG_ENTRIES, +#if VCHIQ_ENABLE_DEBUG + DEBUG_SLOT_HANDLER_COUNT, + DEBUG_SLOT_HANDLER_LINE, + DEBUG_PARSE_LINE, + DEBUG_PARSE_HEADER, + DEBUG_PARSE_MSGID, + DEBUG_AWAIT_COMPLETION_LINE, + DEBUG_DEQUEUE_MESSAGE_LINE, + DEBUG_SERVICE_CALLBACK_LINE, + DEBUG_MSG_QUEUE_FULL_COUNT, + DEBUG_COMPLETION_QUEUE_FULL_COUNT, +#endif + DEBUG_MAX +}; + +#if VCHIQ_ENABLE_DEBUG + +#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug +#define DEBUG_TRACE(d) \ + do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0) +#define DEBUG_VALUE(d, v) \ + do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0) +#define DEBUG_COUNT(d) \ + do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0) + +#else /* VCHIQ_ENABLE_DEBUG */ + +#define DEBUG_INITIALISE(local) +#define DEBUG_TRACE(d) +#define DEBUG_VALUE(d, v) +#define DEBUG_COUNT(d) + +#endif /* VCHIQ_ENABLE_DEBUG */ + +enum vchiq_connstate { + VCHIQ_CONNSTATE_DISCONNECTED, + VCHIQ_CONNSTATE_CONNECTING, + VCHIQ_CONNSTATE_CONNECTED, + VCHIQ_CONNSTATE_PAUSING, + VCHIQ_CONNSTATE_PAUSE_SENT, + VCHIQ_CONNSTATE_PAUSED, + VCHIQ_CONNSTATE_RESUMING, + VCHIQ_CONNSTATE_PAUSE_TIMEOUT, + 
VCHIQ_CONNSTATE_RESUME_TIMEOUT +}; + +enum { + VCHIQ_SRVSTATE_FREE, + VCHIQ_SRVSTATE_HIDDEN, + VCHIQ_SRVSTATE_LISTENING, + VCHIQ_SRVSTATE_OPENING, + VCHIQ_SRVSTATE_OPEN, + VCHIQ_SRVSTATE_OPENSYNC, + VCHIQ_SRVSTATE_CLOSESENT, + VCHIQ_SRVSTATE_CLOSERECVD, + VCHIQ_SRVSTATE_CLOSEWAIT, + VCHIQ_SRVSTATE_CLOSED +}; + +enum vchiq_bulk_dir { + VCHIQ_BULK_TRANSMIT, + VCHIQ_BULK_RECEIVE +}; + +struct vchiq_bulk { + short mode; + short dir; + void *cb_data; + void __user *cb_userdata; + struct bulk_waiter *waiter; + dma_addr_t dma_addr; + int size; + void *remote_data; + int remote_size; + int actual; + void *offset; + void __user *uoffset; +}; + +struct vchiq_bulk_queue { + int local_insert; /* Where to insert the next local bulk */ + int remote_insert; /* Where to insert the next remote bulk (master) */ + int process; /* Bulk to transfer next */ + int remote_notify; /* Bulk to notify the remote client of next (mstr) */ + int remove; /* Bulk to notify the local client of, and remove, next */ + struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS]; +}; + +/* + * Remote events provide a way of presenting several virtual doorbells to a + * peer (ARM host to VPU) using only one physical doorbell. They can be thought + * of as a way for the peer to signal a semaphore, in this case implemented as + * a workqueue. + * + * Remote events remain signalled until acknowledged by the receiver, and they + * are non-counting. They are designed in such a way as to minimise the number + * of interrupts and avoid unnecessary waiting. + * + * A remote_event is as small data structures that live in shared memory. It + * comprises two booleans - armed and fired: + * + * The sender sets fired when they signal the receiver. + * If fired is set, the receiver has been signalled and need not wait. + * The receiver sets the armed field before they begin to wait. + * If armed is set, the receiver is waiting and wishes to be woken by interrupt. + */ +struct remote_event { + int armed; + int fired; + u32 __unused; +}; + +struct opaque_platform_state; + +struct vchiq_slot { + char data[VCHIQ_SLOT_SIZE]; +}; + +struct vchiq_slot_info { + /* Use two counters rather than one to avoid the need for a mutex. */ + short use_count; + short release_count; +}; + +/* + * VCHIQ is a reliable connection-oriented datagram protocol. + * + * A VCHIQ service is equivalent to a TCP connection, except: + * + FOURCCs are used for the rendezvous, and port numbers are assigned at the + * time the connection is established. + * + There is less of a distinction between server and client sockets, the only + * difference being which end makes the first move. + * + For a multi-client server, the server creates new "listening" services as + * the existing one becomes connected - there is no need to specify the + * maximum number of clients up front. + * + Data transfer is reliable but packetized (messages have defined ends). + * + Messages can be either short (capable of fitting in a slot) and in-band, + * or copied between external buffers (bulk transfers). 
+ */ +struct vchiq_service { + struct vchiq_service_base base; + unsigned int handle; + struct kref ref_count; + struct rcu_head rcu; + int srvstate; + void (*userdata_term)(void *userdata); + unsigned int localport; + unsigned int remoteport; + int public_fourcc; + int client_id; + char auto_close; + char sync; + char closing; + char trace; + atomic_t poll_flags; + short version; + short version_min; + short peer_version; + + struct vchiq_state *state; + struct vchiq_instance *instance; + + int service_use_count; + + struct vchiq_bulk_queue bulk_tx; + struct vchiq_bulk_queue bulk_rx; + + struct completion remove_event; + struct completion bulk_remove_event; + struct mutex bulk_mutex; + + struct service_stats_struct { + int quota_stalls; + int slot_stalls; + int bulk_stalls; + int error_count; + int ctrl_tx_count; + int ctrl_rx_count; + int bulk_tx_count; + int bulk_rx_count; + int bulk_aborted_count; + u64 ctrl_tx_bytes; + u64 ctrl_rx_bytes; + u64 bulk_tx_bytes; + u64 bulk_rx_bytes; + } stats; + + int msg_queue_read; + int msg_queue_write; + struct completion msg_queue_pop; + struct completion msg_queue_push; + struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS]; +}; + +/* + * The quota information is outside struct vchiq_service so that it can + * be statically allocated, since for accounting reasons a service's slot + * usage is carried over between users of the same port number. + */ +struct vchiq_service_quota { + unsigned short slot_quota; + unsigned short slot_use_count; + unsigned short message_quota; + unsigned short message_use_count; + struct completion quota_event; + int previous_tx_index; +}; + +struct vchiq_shared_state { + /* A non-zero value here indicates that the content is valid. */ + int initialised; + + /* The first and last (inclusive) slots allocated to the owner. */ + int slot_first; + int slot_last; + + /* The slot allocated to synchronous messages from the owner. */ + int slot_sync; + + /* + * Signalling this event indicates that owner's slot handler thread + * should run. + */ + struct remote_event trigger; + + /* + * Indicates the byte position within the stream where the next message + * will be written. The least significant bits are an index into the + * slot. The next bits are the index of the slot in slot_queue. + */ + int tx_pos; + + /* This event should be signalled when a slot is recycled. */ + struct remote_event recycle; + + /* The slot_queue index where the next recycled slot will be written. */ + int slot_queue_recycle; + + /* This event should be signalled when a synchronous message is sent. */ + struct remote_event sync_trigger; + + /* + * This event should be signalled when a synchronous message has been + * released. + */ + struct remote_event sync_release; + + /* A circular buffer of slot indexes. */ + int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; + + /* Debugging state */ + int debug[DEBUG_MAX]; +}; + +/* + * vchiq_slot_zero describes the memory shared between the ARM host and the + * VideoCore VPU. The "master" and "slave" states are owned by the respective + * sides but visible to the other; the slots are shared, and the remaining + * fields are read-only. + * + * In the configuration used by this implementation, the memory is allocated + * by the host, the VPU is the master (the side which controls the DMA for bulk + * transfers), and the host is the slave. + * + * The ownership of slots changes with use: + * + When empty they are owned by the sender. + * + When partially filled they are shared with the receiver. 
+ * + When completely full they are owned by the receiver. + * + When the receiver has finished processing the contents, they are recycled + * back to the sender. + */ +struct vchiq_slot_zero { + int magic; + short version; + short version_min; + int slot_zero_size; + int slot_size; + int max_slots; + int max_slots_per_side; + int platform_data[2]; + struct vchiq_shared_state master; + struct vchiq_shared_state slave; + struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS]; +}; + +/* + * This is the private runtime state used by each side. The same structure was + * originally used by both sides, but implementations have since diverged. + */ +struct vchiq_state { + struct device *dev; + int id; + int initialised; + enum vchiq_connstate conn_state; + short version_common; + + struct vchiq_shared_state *local; + struct vchiq_shared_state *remote; + struct vchiq_slot *slot_data; + + unsigned short default_slot_quota; + unsigned short default_message_quota; + + /* Event indicating connect message received */ + struct completion connect; + + /* Mutex protecting services */ + struct mutex mutex; + struct vchiq_instance **instance; + + /* Processes all incoming messages which aren't synchronous */ + struct task_struct *slot_handler_thread; + + /* + * Slots which have been fully processed and released by the (peer) + * receiver are added to the receiver queue, which is asynchronously + * processed by the recycle thread. + */ + struct task_struct *recycle_thread; + + /* + * Processes incoming synchronous messages + * + * The synchronous message channel is shared between all synchronous + * services, and provides a way for urgent messages to bypass + * potentially long queues of asynchronous messages in the normal slots. + * + * There can be only one outstanding synchronous message in + * each direction, and as a precious shared resource synchronous + * services should be used sparingly. + */ + struct task_struct *sync_thread; + + /* Local implementation of the trigger remote event */ + wait_queue_head_t trigger_event; + + /* Local implementation of the recycle remote event */ + wait_queue_head_t recycle_event; + + /* Local implementation of the sync trigger remote event */ + wait_queue_head_t sync_trigger_event; + + /* Local implementation of the sync release remote event */ + wait_queue_head_t sync_release_event; + + char *tx_data; + char *rx_data; + struct vchiq_slot_info *rx_info; + + struct mutex slot_mutex; + + struct mutex recycle_mutex; + + struct mutex sync_mutex; + + spinlock_t msg_queue_spinlock; + + spinlock_t bulk_waiter_spinlock; + + spinlock_t quota_spinlock; + + /* + * Indicates the byte position within the stream from where the next + * message will be read. The least significant bits are an index into + * the slot.The next bits are the index of the slot in + * remote->slot_queue. + */ + int rx_pos; + + /* + * A cached copy of local->tx_pos. Only write to local->tx_pos, and read + * from remote->tx_pos. + */ + int local_tx_pos; + + /* The slot_queue index of the slot to become available next. */ + int slot_queue_available; + + /* A flag to indicate if any poll has been requested */ + int poll_needed; + + /* Ths index of the previous slot used for data messages. */ + int previous_data_index; + + /* The number of slots occupied by data messages. */ + unsigned short data_use_count; + + /* The maximum number of slots to be occupied by data messages. */ + unsigned short data_quota; + + /* An array of bit sets indicating which services must be polled. 
*/ + atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; + + /* The number of the first unused service */ + int unused_service; + + /* Signalled when a free slot becomes available. */ + struct completion slot_available_event; + + /* Signalled when a free data slot becomes available. */ + struct completion data_quota_event; + + struct state_stats_struct { + int slot_stalls; + int data_stalls; + int ctrl_tx_count; + int ctrl_rx_count; + int error_count; + } stats; + + struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES]; + struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES]; + struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS]; + + struct opaque_platform_state *platform_state; +}; + +struct pagelist { + u32 length; + u16 type; + u16 offset; + u32 addrs[1]; /* N.B. 12 LSBs hold the number + * of following pages at consecutive + * addresses. + */ +}; + +struct vchiq_pagelist_info { + struct pagelist *pagelist; + size_t pagelist_buffer_size; + dma_addr_t dma_addr; + enum dma_data_direction dma_dir; + unsigned int num_pages; + unsigned int pages_need_release; + struct page **pages; + struct scatterlist *scatterlist; + unsigned int scatterlist_mapped; +}; + +static inline bool vchiq_remote_initialised(const struct vchiq_state *state) +{ + return state->remote && state->remote->initialised; +} + +struct bulk_waiter { + struct vchiq_bulk *bulk; + struct completion event; + int actual; +}; + +struct vchiq_config { + unsigned int max_msg_size; + unsigned int bulk_threshold; /* The message size above which it + * is better to use a bulk transfer + * (<= max_msg_size) + */ + unsigned int max_outstanding_bulks; + unsigned int max_services; + short version; /* The version of VCHIQ */ + short version_min; /* The minimum compatible version of VCHIQ */ +}; + +extern spinlock_t bulk_waiter_spinlock; + +extern const char * +get_conn_state_name(enum vchiq_connstate conn_state); + +extern struct vchiq_slot_zero * +vchiq_init_slots(struct device *dev, void *mem_base, int mem_size); + +extern int +vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev); + +extern int +vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +struct vchiq_service * +vchiq_add_service_internal(struct vchiq_state *state, + const struct vchiq_service_params_kernel *params, + int srvstate, struct vchiq_instance *instance, + void (*userdata_term)(void *userdata)); + +extern int +vchiq_open_service_internal(struct vchiq_service *service, int client_id); + +extern int +vchiq_close_service_internal(struct vchiq_service *service, int close_recvd); + +extern void +vchiq_terminate_service_internal(struct vchiq_service *service); + +extern void +vchiq_free_service_internal(struct vchiq_service *service); + +extern void +vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +extern void +remote_event_pollall(struct vchiq_state *state); + +extern int +vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle, + struct bulk_waiter *userdata); + +extern int +vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern int +vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern void +vchiq_dump_state(struct seq_file *f, struct vchiq_state *state); + +extern void +request_poll(struct vchiq_state *state, struct vchiq_service *service, + int poll_type); + +struct vchiq_service 
*handle_to_service(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_handle(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_port(struct vchiq_state *state, unsigned int localport); + +extern struct vchiq_service * +find_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +__next_service_by_instance(struct vchiq_state *state, + struct vchiq_instance *instance, + int *pidx); + +extern struct vchiq_service * +next_service_by_instance(struct vchiq_state *state, + struct vchiq_instance *instance, + int *pidx); + +extern void +vchiq_service_get(struct vchiq_service *service); + +extern void +vchiq_service_put(struct vchiq_service *service); + +extern int +vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle, + ssize_t (*copy_callback)(void *context, void *dest, + size_t offset, size_t maxsize), + void *context, + size_t size); + +void vchiq_dump_platform_state(struct seq_file *f); + +void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f); + +void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service); + +int vchiq_use_service_internal(struct vchiq_service *service); + +int vchiq_release_service_internal(struct vchiq_service *service); + +void vchiq_on_remote_use(struct vchiq_state *state); + +void vchiq_on_remote_release(struct vchiq_state *state); + +int vchiq_platform_init_state(struct vchiq_state *state); + +int vchiq_check_service(struct vchiq_service *service); + +int vchiq_send_remote_use(struct vchiq_state *state); + +int vchiq_send_remote_use_active(struct vchiq_state *state); + +void vchiq_platform_conn_state_changed(struct vchiq_state *state, + enum vchiq_connstate oldstate, + enum vchiq_connstate newstate); + +void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate); + +void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr, + const void *void_mem, size_t num_bytes); + +int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service); + +int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service); + +void vchiq_get_config(struct vchiq_config *config); + +int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service, + enum vchiq_service_option option, int value); + +#endif diff --git a/include/linux/raspberrypi/vchiq_debugfs.h b/include/linux/raspberrypi/vchiq_debugfs.h new file mode 100644 index 000000000000..b29e6693c949 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_debugfs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. 
*/ + +#ifndef VCHIQ_DEBUGFS_H +#define VCHIQ_DEBUGFS_H + +struct vchiq_state; +struct vchiq_instance; + +struct vchiq_debugfs_node { + struct dentry *dentry; +}; + +void vchiq_debugfs_init(struct vchiq_state *state); + +void vchiq_debugfs_deinit(void); + +void vchiq_debugfs_add_instance(struct vchiq_instance *instance); + +void vchiq_debugfs_remove_instance(struct vchiq_instance *instance); + +#endif /* VCHIQ_DEBUGFS_H */ -- cgit v1.2.3 From 7b8a8ec20cfce2298f6737089f5d17407ea346b4 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 27 Oct 2025 11:34:01 +0200 Subject: PCI/TPH: Expose pcie_tph_get_st_table_loc() Expose pcie_tph_get_st_table_loc() to be used by drivers as will be done in the next patch from the series. Signed-off-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251027-st-direct-mode-v1-1-e0ad953866b6@nvidia.com Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- drivers/pci/tph.c | 16 +++++++++++++--- include/linux/pci-tph.h | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index cc64f93709a4..ca4f97be7538 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -155,7 +155,16 @@ static u8 get_st_modes(struct pci_dev *pdev) return reg; } -static u32 get_st_table_loc(struct pci_dev *pdev) +/** + * pcie_tph_get_st_table_loc - Return the device's ST table location + * @pdev: PCI device to query + * + * Return: + * PCI_TPH_LOC_NONE - Not present + * PCI_TPH_LOC_CAP - Located in the TPH Requester Extended Capability + * PCI_TPH_LOC_MSIX - Located in the MSI-X Table + */ +u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev) { u32 reg; @@ -163,6 +172,7 @@ static u32 get_st_table_loc(struct pci_dev *pdev) return FIELD_GET(PCI_TPH_CAP_LOC_MASK, reg); } +EXPORT_SYMBOL(pcie_tph_get_st_table_loc); /* * Return the size of ST table. If ST table is not in TPH Requester Extended @@ -174,7 +184,7 @@ u16 pcie_tph_get_st_table_size(struct pci_dev *pdev) u32 loc; /* Check ST table location first */ - loc = get_st_table_loc(pdev); + loc = pcie_tph_get_st_table_loc(pdev); /* Convert loc to match with PCI_TPH_LOC_* defined in pci_regs.h */ loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc); @@ -299,7 +309,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) */ set_ctrl_reg_req_en(pdev, PCI_TPH_REQ_DISABLE); - loc = get_st_table_loc(pdev); + loc = pcie_tph_get_st_table_loc(pdev); /* Convert loc to match with PCI_TPH_LOC_* */ loc = FIELD_PREP(PCI_TPH_CAP_LOC_MASK, loc); diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index 9e4e331b1603..ba28140ce670 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -29,6 +29,7 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev, void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct pci_dev *pdev, int mode); u16 pcie_tph_get_st_table_size(struct pci_dev *pdev); +u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev); #else static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) -- cgit v1.2.3 From 6948417b3f1fafbeab85c051f8dba5e305a8f9c4 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:53 +0200 Subject: net/mlx5: Add OTHER_ESWITCH HW capabilities Add OTHER_ESWITCH capabilities which includes other_eswitch and eswitch_owner_vhca_id to all steering objects. 
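For reference, the new bits are meant to be programmed by the flow steering command path. A minimal, illustrative sketch (mirroring the usage added in the next patch of this series; "in" is the command mailbox and "peer_vhca_id" is a placeholder for the owning eswitch's vhca id) would be:

	/* Illustrative only: target a table owned by another eswitch manager. */
	MLX5_SET(create_flow_table_in, in, other_eswitch, 1);
	MLX5_SET(create_flow_table_in, in, eswitch_owner_vhca_id, peer_vhca_id);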
Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-1-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 47 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 07614cd95bed..9b8f88987d2f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5251,13 +5251,15 @@ struct mlx5_ifc_set_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8809,13 +8811,15 @@ struct mlx5_ifc_destroy_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8840,13 +8844,15 @@ struct mlx5_ifc_destroy_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8985,13 +8991,15 @@ struct mlx5_ifc_delete_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -9535,13 +9543,15 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x20]; @@ -9580,7 +9590,8 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; @@ -9588,7 +9599,7 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 table_type[0x8]; u8 reserved_at_88[0x4]; u8 group_type[0x4]; - u8 reserved_at_90[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -11876,10 +11887,12 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; - u8 reserved_at_60[0x20]; + u8 reserved_at_60[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 table_type[0x8]; u8 reserved_at_88[0x7]; @@ -11919,14 +11932,16 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x10]; u8 modify_field_select[0x10]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 
eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; -- cgit v1.2.3 From 3b848dec7e821bace785b9e405bf1884c077635a Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:54 +0200 Subject: net/mlx5: fs, Add other_eswitch support for steering tables Add other_eswitch support which allows flow tables creation above vports that reside on different esw managers. The new flag MLX5_FLOW_TABLE_OTHER_ESWITCH indicates if the esw_owner_vhca_id attribute is supported. Note that this is only supported if the Advanced-RDMA cap- rdma_transport_manager_other_eswitch is set. And it is the caller responsibility to check that. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-2-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 31 +++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 18 ++++++------- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + include/linux/mlx5/fs.h | 2 ++ 4 files changed, 42 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 1af76da8b132..ced747bef641 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -239,6 +239,10 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); MLX5_SET(set_flow_table_root_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(set_flow_table_root_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(set_flow_table_root_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_in(dev, set_flow_table_root, in); if (!err && @@ -302,6 +306,10 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_table_in, in, vport_number, ft->vport); MLX5_SET(create_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(create_flow_table_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(create_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, en_decap); @@ -360,6 +368,10 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); MLX5_SET(destroy_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(destroy_flow_table_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(destroy_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_in(dev, destroy_flow_table, in); if (!err) @@ -394,6 +406,10 @@ static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport); MLX5_SET(modify_flow_table_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(modify_flow_table_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(modify_flow_table_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); MLX5_SET(modify_flow_table_in, in, modify_field_select, MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); if (next_ft) { @@ -429,6 +445,10 @@ static int 
mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); MLX5_SET(create_flow_group_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(create_flow_group_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(create_flow_group_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out); if (!err) fg->id = MLX5_GET(create_flow_group_out, out, @@ -451,6 +471,10 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); MLX5_SET(destroy_flow_group_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(destroy_flow_group_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(destroy_flow_group_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); return mlx5_cmd_exec_in(dev, destroy_flow_group, in); } @@ -559,6 +583,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(set_fte_in, in, vport_number, ft->vport); MLX5_SET(set_fte_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(set_fte_in, in, eswitch_owner_vhca_id, ft->esw_owner_vhca_id); + MLX5_SET(set_fte_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); MLX5_SET(flow_context, in_flow_context, group_id, group_id); @@ -788,6 +815,10 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, MLX5_SET(delete_fte_in, in, vport_number, ft->vport); MLX5_SET(delete_fte_in, in, other_vport, !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(delete_fte_in, in, eswitch_owner_vhca_id, + ft->esw_owner_vhca_id); + MLX5_SET(delete_fte_in, in, other_eswitch, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_ESWITCH)); return mlx5_cmd_exec_in(dev, delete_fte, in); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2db3ffb0a2b2..87e381c82ed3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -939,10 +939,10 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f return fg; } -static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, - enum fs_flow_table_type table_type, - enum fs_flow_table_op_mod op_mod, - u32 flags) +static struct mlx5_flow_table * +alloc_flow_table(struct mlx5_flow_table_attr *ft_attr, u16 vport, + enum fs_flow_table_type table_type, + enum fs_flow_table_op_mod op_mod) { struct mlx5_flow_table *ft; int ret; @@ -957,12 +957,13 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, return ERR_PTR(ret); } - ft->level = level; + ft->level = ft_attr->level; ft->node.type = FS_TYPE_FLOW_TABLE; ft->op_mod = op_mod; ft->type = table_type; ft->vport = vport; - ft->flags = flags; + ft->esw_owner_vhca_id = ft_attr->esw_owner_vhca_id; + ft->flags = ft_attr->flags; INIT_LIST_HEAD(&ft->fwd_rules); mutex_init(&ft->lock); @@ -1370,10 +1371,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa /* The level is related to the * priority level range. 
*/ - ft = alloc_flow_table(ft_attr->level, - vport, - root->table_type, - op_mod, ft_attr->flags); + ft = alloc_flow_table(ft_attr, vport, root->table_type, op_mod); if (IS_ERR(ft)) { err = PTR_ERR(ft); goto unlock_root; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 8458ce203dac..0a9a5ef34c21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -205,6 +205,7 @@ struct mlx5_flow_table { }; u32 id; u16 vport; + u16 esw_owner_vhca_id; unsigned int max_fte; unsigned int level; enum fs_flow_table_type type; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6ac76a0c3827..6325a7fa0df2 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -71,6 +71,7 @@ enum { MLX5_FLOW_TABLE_UNMANAGED = BIT(3), MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4), MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5), + MLX5_FLOW_TABLE_OTHER_ESWITCH = BIT(6), }; #define LEFTOVERS_RULE_NUM 2 @@ -208,6 +209,7 @@ struct mlx5_flow_table_attr { u32 flags; u16 uid; u16 vport; + u16 esw_owner_vhca_id; struct mlx5_flow_table *next_ft; struct { -- cgit v1.2.3 From 583b4fe1c19d978bb787e0adf9ce469cb7f68455 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:55 +0200 Subject: net/mlx5: fs, set non default device per namespace Add mlx5_fs_set_root_dev() function which swaps the root namespace core device with another one for a given table_type. It is intended for usage only by RDMA_TRANSPORT tables in case of LAG configuration, to allow the creation of tables during LAG always through the LAG master device, which is valid since during LAG the master is allowed to manage the RDMA_TRANSPORT tables of its slaves. In addition move the table_type enum to global include to allow its use in a downstream patch in the RDMA driver. 
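The expected caller pattern, to be introduced by a downstream RDMA patch, is roughly the following sketch (the device variables are placeholders; only the RDMA_TRANSPORT table types are accepted):

	/* On LAG activation: manage the slave's RDMA_TRANSPORT tables
	 * through the LAG master device (illustrative only).
	 */
	err = mlx5_fs_set_root_dev(slave_dev, master_dev, FS_FT_RDMA_TRANSPORT_RX);
	if (!err)
		err = mlx5_fs_set_root_dev(slave_dev, master_dev, FS_FT_RDMA_TRANSPORT_TX);
	/* On LAG teardown, the original device is restored the same way. */

The call fails with -EINVAL if any of the per-vport RDMA_TRANSPORT namespaces still holds flow tables, so the switch must be done while the tables are empty.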
Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-3-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 56 +++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 18 -------- include/linux/mlx5/fs.h | 22 +++++++++ 3 files changed, 78 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 87e381c82ed3..5b210c54a592 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3308,6 +3308,62 @@ err: return ret; } +static bool mlx5_fs_ns_is_empty(struct mlx5_flow_namespace *ns) +{ + struct fs_prio *iter_prio; + + fs_for_each_prio(iter_prio, ns) { + if (iter_prio->num_ft) + return false; + } + + return true; +} + +int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev, + struct mlx5_core_dev *new_dev, + enum fs_flow_table_type table_type) +{ + struct mlx5_flow_root_namespace **root; + int total_vports; + int i; + + switch (table_type) { + case FS_FT_RDMA_TRANSPORT_TX: + root = dev->priv.steering->rdma_transport_tx_root_ns; + total_vports = dev->priv.steering->rdma_transport_tx_vports; + break; + case FS_FT_RDMA_TRANSPORT_RX: + root = dev->priv.steering->rdma_transport_rx_root_ns; + total_vports = dev->priv.steering->rdma_transport_rx_vports; + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; + } + + for (i = 0; i < total_vports; i++) { + mutex_lock(&root[i]->chain_lock); + if (!mlx5_fs_ns_is_empty(&root[i]->ns)) { + mutex_unlock(&root[i]->chain_lock); + goto err; + } + root[i]->dev = new_dev; + mutex_unlock(&root[i]->chain_lock); + } + return 0; +err: + while (i--) { + mutex_lock(&root[i]->chain_lock); + root[i]->dev = dev; + mutex_unlock(&root[i]->chain_lock); + } + /* If you hit this error try destroying all flow tables and try again */ + mlx5_core_err(dev, "Failed to set root device for RDMA TRANSPORT\n"); + return -EINVAL; +} +EXPORT_SYMBOL(mlx5_fs_set_root_dev); + static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering) { struct mlx5_core_dev *dev = steering->dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 0a9a5ef34c21..1c6591425260 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -103,24 +103,6 @@ enum fs_node_type { FS_TYPE_FLOW_DEST }; -enum fs_flow_table_type { - FS_FT_NIC_RX = 0x0, - FS_FT_NIC_TX = 0x1, - FS_FT_ESW_EGRESS_ACL = 0x2, - FS_FT_ESW_INGRESS_ACL = 0x3, - FS_FT_FDB = 0X4, - FS_FT_SNIFFER_RX = 0X5, - FS_FT_SNIFFER_TX = 0X6, - FS_FT_RDMA_RX = 0X7, - FS_FT_RDMA_TX = 0X8, - FS_FT_PORT_SEL = 0X9, - FS_FT_FDB_RX = 0xa, - FS_FT_FDB_TX = 0xb, - FS_FT_RDMA_TRANSPORT_RX = 0xd, - FS_FT_RDMA_TRANSPORT_TX = 0xe, - FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX, -}; - enum fs_flow_table_op_mod { FS_FT_OP_MOD_NORMAL, FS_FT_OP_MOD_LAG_DEMUX, diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6325a7fa0df2..fe721557bd1d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -128,6 +128,24 @@ enum { FDB_PER_VPORT, }; +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, + FS_FT_RDMA_RX = 0X7, + FS_FT_RDMA_TX 
= 0X8, + FS_FT_PORT_SEL = 0X9, + FS_FT_FDB_RX = 0xa, + FS_FT_FDB_TX = 0xb, + FS_FT_RDMA_TRANSPORT_RX = 0xd, + FS_FT_RDMA_TRANSPORT_TX = 0xe, + FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX, +}; + struct mlx5_pkt_reformat; struct mlx5_modify_hdr; struct mlx5_flow_definer; @@ -355,4 +373,8 @@ u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); struct mlx5_flow_root_namespace * mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type); + +int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev, + struct mlx5_core_dev *new_dev, + enum fs_flow_table_type table_type); #endif -- cgit v1.2.3 From 1d165919c8261b927f8dc8cfe61eb04342bedb7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Oct 2025 19:47:59 -0700 Subject: iio: imu: adis: fix all kernel-doc warnings in header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct and add to adis.h to resolve all kernel-doc warnings: - add a missing struct member description - change one non-kernel-doc comment to use /* instead of /** - correct function parameter @value to @val (7 locations) - add function return value comments (13 locations) Warning: include/linux/iio/imu/adis.h:97 struct member 'has_fifo' not described in 'adis_data' Warning: include/linux/iio/imu/adis.h:139 Incorrect use of kernel-doc format: * The state_lock is meant to be used during operations that require Warning: include/linux/iio/imu/adis.h:158 struct member '"__adis_"' not described in 'adis' Warning: include/linux/iio/imu/adis.h:264 function parameter 'val' not described in 'adis_write_reg' Warning: include/linux/iio/imu/adis.h:371 No description found for return value of 'adis_update_bits_base' Signed-off-by: Randy Dunlap Reviewed-by: Nuno Sá Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 45 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index aa160511e265..bfb6df68e6c9 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -57,6 +57,7 @@ struct adis_timeout { * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin * @has_paging: True if ADIS device has paged registers + * @has_fifo: True if ADIS device has a hardware FIFO * @burst_reg_cmd: Register command that triggers burst * @burst_len: Burst size in the SPI RX buffer. If @burst_max_len is defined, * this should be the minimum size supported by the device. 
@@ -136,7 +137,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; const struct adis_ops *ops; - /** + /* * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer * information (fields 'xfer', 'msg' & 'current_page') between @@ -166,7 +167,7 @@ int __adis_reset(struct adis *adis); * adis_reset() - Reset the device * @adis: The adis device * - * Returns 0 on success, a negative error code otherwise + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_reset(struct adis *adis) { @@ -183,7 +184,9 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, * __adis_write_reg_8() - Write single byte to a register (unlocked) * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -195,7 +198,9 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, * __adis_write_reg_16() - Write 2 bytes to a pair of registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -207,7 +212,9 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, * __adis_write_reg_32() - write 4 bytes to four registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -220,6 +227,8 @@ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -239,6 +248,8 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -257,8 +268,10 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, * adis_write_reg() - write N bytes to register * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: The value to write to device (up to 4 bytes) + * @val: The value to write to device (up to 4 bytes) * @size: The size of the @value (in bytes) + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) @@ -273,6 +286,8 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, * @reg: The address of the lower of the two registers * @val: The value read back from the device * @size: The size of the @val buffer + * + * Returns: %0 on success, a negative error code otherwise */ static int 
adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) @@ -285,7 +300,9 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, * adis_write_reg_8() - Write single byte to a register * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -297,7 +314,9 @@ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg, * adis_write_reg_16() - Write 2 bytes to a pair of registers * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -309,7 +328,9 @@ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, * adis_write_reg_32() - write 4 bytes to four registers * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -322,6 +343,8 @@ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -341,6 +364,8 @@ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -366,6 +391,8 @@ int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, * @size: Size of the register to update * * Updates the desired bits of @reg in accordance with @mask and @val. + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, const u32 val, u8 size) -- cgit v1.2.3 From 77008e1b2ef73249bceb078a321a3ff6bc087afb Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 16 Oct 2025 21:36:30 -0400 Subject: mm/huge_memory: do not change split_huge_page*() target order silently Page cache folios from a file system that support large block size (LBS) can have minimal folio order greater than 0, thus a high order folio might not be able to be split down to order-0. Commit e220917fa507 ("mm: split a folio in minimum folio order chunks") bumps the target order of split_huge_page*() to the minimum allowed order when splitting a LBS folio. This causes confusion for some split_huge_page*() callers like memory failure handling code, since they expect after-split folios all have order-0 when split succeeds but in reality get min_order_for_split() order folios and give warnings. Fix it by failing a split if the folio cannot be split to the target order. Rename try_folio_split() to try_folio_split_to_order() to reflect the added new_order parameter. Remove its unused list parameter. 
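With the new signature, callers pass the target order explicitly instead of relying on an implicit bump to the minimum order; a sketch of the adjusted caller pattern (based on the truncate.c hunk below) is:

	unsigned int min_order = mapping_min_folio_order(folio->mapping);

	if (try_folio_split_to_order(folio, split_at, min_order)) {
		/* Split failed; the folio keeps its current order. */
	}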
[The test poisons LBS folios, which cannot be split to order-0 folios, and also tries to poison all memory. The non split LBS folios take more memory than the test anticipated, leading to OOM. The patch fixed the kernel warning and the test needs some change to avoid OOM.] Link: https://lkml.kernel.org/r/20251017013630.139907-1-ziy@nvidia.com Fixes: e220917fa507 ("mm: split a folio in minimum folio order chunks") Signed-off-by: Zi Yan Reported-by: syzbot+e6367ea2fdab6ed46056@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d2c943.a70a0220.1b52b.02b3.GAE@google.com/ Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Miaohe Lin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jane Chu Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 55 +++++++++++++++++++++---------------------------- mm/huge_memory.c | 9 +------- mm/truncate.c | 6 ++++-- 3 files changed, 28 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc985..71ac78b9f834 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); /* - * try_folio_split - try to split a @folio at @page using non uniform split. + * try_folio_split_to_order - try to split a @folio at @page to @new_order using + * non uniform split. * @folio: folio to be split - * @page: split to order-0 at the given page - * @list: store the after-split folios + * @page: split to @new_order at the given page + * @new_order: the target split order * - * Try to split a @folio at @page using non uniform split to order-0, if - * non uniform split is not supported, fall back to uniform split. + * Try to split a @folio at @page using non uniform split to @new_order, if + * non uniform split is not supported, fall back to uniform split. After-split + * folios are put back to LRU list. Use min_order_for_split() to get the lower + * bound of @new_order. * * Return: 0: split is successful, otherwise split failed. */ -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - if (!non_uniform_split_supported(folio, 0, false)) - return split_huge_page_to_list_to_order(&folio->page, list, - ret); - return folio_split(folio, ret, page, list); + if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + return split_huge_page_to_list_to_order(&folio->page, NULL, + new_order); + return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) { - struct folio *folio = page_folio(page); - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - /* - * split_huge_page() locks the page before splitting and - * expects the same page that has been split to be locked when - * returned. split_folio(page_folio(page)) cannot be used here - * because it converts the page to folio and passes the head - * page to be split. 
- */ - return split_huge_page_to_list_to_order(page, NULL, ret); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); @@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page) return -EINVAL; } +static inline int min_order_for_split(struct folio *folio) +{ + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; +} + static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; } -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d1b74950332..feac4aef7dfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3653,8 +3653,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order, min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", - min_order); ret = -EINVAL; goto out; } @@ -3986,12 +3984,7 @@ int min_order_for_split(struct folio *folio) int split_folio_to_list(struct folio *folio, struct list_head *list) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - return split_huge_page_to_list_to_order(&folio->page, list, ret); + return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* diff --git a/mm/truncate.c b/mm/truncate.c index 91eb92a5ce4f..9210cf808f5c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -194,6 +194,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; + unsigned int min_order; if (pos < start) offset = start - pos; @@ -223,8 +224,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_test_large(folio)) return true; + min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { + if (!try_folio_split_to_order(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste @@ -254,7 +256,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) - try_folio_split(folio2, split_at2, NULL); + try_folio_split_to_order(folio2, split_at2, min_order); folio_unlock(folio2); out: -- cgit v1.2.3 From fa759cd75bce5489eed34596daa53f721849a86f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:52 -0400 Subject: kho: allocate metadata directly from the buddy allocator KHO allocates metadata for its preserved memory map using the slab allocator via kzalloc(). This metadata is temporary and is used by the next kernel during early boot to find preserved memory. A problem arises when KFENCE is enabled. kzalloc() calls can be randomly intercepted by kfence_alloc(), which services the allocation from a dedicated KFENCE memory pool. This pool is allocated early in boot via memblock. When booting via KHO, the memblock allocator is restricted to a "scratch area", forcing the KFENCE pool to be allocated within it. 
This creates a conflict, as the scratch area is expected to be ephemeral and overwriteable by a subsequent kexec. If KHO metadata is placed in this KFENCE pool, it leads to memory corruption when the next kernel is loaded. To fix this, modify KHO to allocate its metadata directly from the buddy allocator instead of slab. Link: https://lkml.kernel.org/r/20251021000852.2924827-4-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: David Matlack Cc: Alexander Graf Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/gfp.h | 3 +++ kernel/kexec_handover.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0ceb4e09306c..623bee335383 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct vm_area_struct; @@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) +DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) + #endif /* __LINUX_GFP_H */ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 9217d2fdd2d3..2a8c20c238a8 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -142,7 +142,7 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) if (res) return res; - void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); @@ -348,9 +348,9 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk __free(kfree) = NULL; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 7cd3d204412b0584df38fd7be20002137f34721a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:23 +0100 Subject: ns: don't increment or decrement initial namespaces There's no need to bump the active reference counts of initial namespaces as they're always active and can simply remain at 1. 
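(For illustration, the shape of the check this change relies on, as a minimal sketch with hypothetical helper names; the real is_ns_init_id() helper and the NS_LAST_INIT_ID bound appear in the hunks below.)

  #include <linux/atomic.h>
  #include <linux/ns_common.h>

  /*
   * Initial namespaces are set up at boot with reserved, fixed IDs, so a
   * plain ID comparison identifies them without touching any counters.
   */
  static inline bool ns_is_initial_sketch(const struct ns_common *ns)
  {
          return ns->ns_id <= NS_LAST_INIT_ID;
  }

  static inline void ns_active_get_sketch(struct ns_common *ns)
  {
          if (ns_is_initial_sketch(ns))
                  return;         /* always active; the count stays at 1 */
          atomic_inc(&ns->__ns_ref_active);
  }
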
Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-2-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 23 ++++++++++++++++++++--- kernel/nscommon.c | 6 ++++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bd4492ef6ffc..791b18dc77d0 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -285,14 +291,19 @@ void __ns_ref_active_get_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_get(struct ns_common *ns) { - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); + /* Initial namespaces are always active. */ + if (!is_ns_init_id(ns)) + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); } #define ns_ref_active_get(__ns) \ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return true; + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); return true; @@ -307,6 +318,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + if (atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(is_initial_namespace(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -319,8 +334,10 @@ static __always_inline void __ns_ref_active_put(struct ns_common *ns) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); - if (!__ns_ref_active_read(ns)) + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; + } if (!__ns_ref_get(ns)) return NULL; return ns; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index d67ae7ad7759..70cb66232e4c 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -177,6 +177,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) ns = ns_owner(ns); if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (!atomic_dec_and_test(&ns->__ns_ref_active)) return; } @@ -276,6 +277,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns) */ void __ns_ref_active_resurrect(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + /* If we didn't resurrect the namespace we're done. */ if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; @@ -289,6 +294,7 @@ void __ns_ref_active_resurrect(struct ns_common *ns) if (!ns) return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (atomic_fetch_add(1, &ns->__ns_ref_active)) return; } -- cgit v1.2.3 From f8d5a8970d2f49411824fb1fdd34bbb3eea22756 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:26 +0100 Subject: ns: handle setns(pidfd, ...) 
cleanly The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // it's active reference count now goes // to zero... -------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. // Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded to grab passive references to the set of namespaces the setns() should simply succeed even if the target task exists or gets reaped in the meantime and thus has dropped all active references to its namespaces. The race is rare and can only be triggered when using pidfs to setns() to namespaces. Also note that active reference on initial namespaces are nops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-5-ae8a4ad5a3b3@kernel.org Fixes: 3c9820d5c64a ("ns: add active reference count") Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- include/linux/ns_common.h | 47 ++++------------------------------------------- kernel/nscommon.c | 21 ++++++++++++--------- kernel/nstree.c | 8 -------- 4 files changed, 17 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index ba6c8975c82e..a80f8d2a4122 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data) * ioctl on such a socket will resurrect the relevant namespace * subtree. 
*/ - __ns_ref_active_resurrect(ns); + __ns_ref_active_get(ns); return 0; } diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 791b18dc77d0..3aaba2ca31d7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -287,47 +287,8 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -void __ns_ref_active_get_owner(struct ns_common *ns); +void __ns_ref_active_put(struct ns_common *ns); -static __always_inline void __ns_ref_active_get(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (!is_ns_init_id(ns)) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} -#define ns_ref_active_get(__ns) \ - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) - -static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (is_ns_init_id(ns)) - return true; - - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - return true; - } - return false; -} - -#define ns_ref_active_get_owner(__ns) \ - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) - -void __ns_ref_active_put_owner(struct ns_common *ns); - -static __always_inline void __ns_ref_active_put(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (is_ns_init_id(ns)) - return; - - if (atomic_dec_and_test(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - __ns_ref_active_put_owner(ns); - } -} #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) @@ -343,9 +304,9 @@ static __always_inline struct ns_common *__must_check ns_get_unless_inactive(str return ns; } -void __ns_ref_active_resurrect(struct ns_common *ns); +void __ns_ref_active_get(struct ns_common *ns); -#define ns_ref_active_resurrect(__ns) \ - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 70cb66232e4c..bfd2d6805776 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -114,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns) return to_ns_common(owner); } -void __ns_ref_active_get_owner(struct ns_common *ns) -{ - ns = ns_owner(ns); - if (ns) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} - /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. @@ -171,8 +164,18 @@ void __ns_ref_active_get_owner(struct ns_common *ns) * The iteration stops once we reach a namespace that still has active * references. */ -void __ns_ref_active_put_owner(struct ns_common *ns) +void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + for (;;) { ns = ns_owner(ns); if (!ns) @@ -275,7 +278,7 @@ void __ns_ref_active_put_owner(struct ns_common *ns) * it also needs to take another reference on its owning user namespace * and so on. 
*/ -void __ns_ref_active_resurrect(struct ns_common *ns) +void __ns_ref_active_get(struct ns_common *ns) { /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) diff --git a/kernel/nstree.c b/kernel/nstree.c index f27f772a6762..97404fb90749 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); - - /* - * Take an active reference on the owner namespace. This ensures - * that the owner remains visible while any of its child namespaces - * are active. For init namespaces this is a no-op as ns_owner() - * returns NULL for namespaces owned by init_user_ns. - */ - __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) -- cgit v1.2.3 From 57b39aabb99ea69b9046df2915404a931d9d6695 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:27 +0100 Subject: ns: add asserts for active refcount underflow Add a few more assert to detect active reference count underflows. Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-6-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 1 - kernel/nscommon.c | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3aaba2ca31d7..66ea09b48377 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -294,7 +294,6 @@ void __ns_ref_active_put(struct ns_common *ns); static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); if (!__ns_ref_active_read(ns)) { VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bfd2d6805776..c910b979e433 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -170,8 +170,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (is_ns_init_id(ns)) return; - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } VFS_WARN_ON_ONCE(is_ns_init_id(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -181,8 +183,10 @@ void __ns_ref_active_put(struct ns_common *ns) if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } } } @@ -280,12 +284,16 @@ void __ns_ref_active_put(struct ns_common *ns) */ void __ns_ref_active_get(struct ns_common *ns) { + int prev; + /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; /* If we didn't resurrect the namespace we're done. 
*/ - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; /* @@ -298,7 +306,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; } } -- cgit v1.2.3 From 69674282fc97fffd98a85ab5b4837edbc5898145 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:49 +0100 Subject: wifi: ieee80211: split mesh definitions out The ieee80211.h file has gotten very long, start splitting it by putting mesh definitions into a separate file. Link: https://patch.msgid.link/20251105153843.489713ca8b34.I3befb4bf6ace0315758a1794224ddd18c4652e32@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-mesh.h | 230 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 211 +------------------------------------ 2 files changed, 232 insertions(+), 209 deletions(-) create mode 100644 include/linux/ieee80211-mesh.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-mesh.h b/include/linux/ieee80211-mesh.h new file mode 100644 index 000000000000..4b829bcb38b6 --- /dev/null +++ b/include/linux/ieee80211-mesh.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 mesh definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_MESH_H +#define LINUX_IEEE80211_MESH_H + +#include +#include + +#define IEEE80211_MAX_MESH_ID_LEN 32 + +struct ieee80211s_hdr { + u8 flags; + u8 ttl; + __le32 seqnum; + u8 eaddr1[ETH_ALEN]; + u8 eaddr2[ETH_ALEN]; +} __packed __aligned(2); + +/* Mesh flags */ +#define MESH_FLAGS_AE_A4 0x1 +#define MESH_FLAGS_AE_A5_A6 0x2 +#define MESH_FLAGS_AE 0x3 +#define MESH_FLAGS_PS_DEEP 0x4 + +/** + * enum ieee80211_preq_flags - mesh PREQ element flags + * + * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield + */ +enum ieee80211_preq_flags { + IEEE80211_PREQ_PROACTIVE_PREP_FLAG = 1<<2, +}; + +/** + * enum ieee80211_preq_target_flags - mesh PREQ element per target flags + * + * @IEEE80211_PREQ_TO_FLAG: target only subfield + * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield + */ +enum ieee80211_preq_target_flags { + IEEE80211_PREQ_TO_FLAG = 1<<0, + IEEE80211_PREQ_USN_FLAG = 1<<2, +}; + +/** + * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE + * @mesh_ttl: Time To Live + * @mesh_flags: Flags + * @mesh_reason: Reason Code + * @mesh_pre_value: Precedence Value + * + * This structure represents the payload of the "Mesh Channel Switch + * Parameters element" as described in IEEE Std 802.11-2020 section + * 9.4.2.102. 
+ */ +struct ieee80211_mesh_chansw_params_ie { + u8 mesh_ttl; + u8 mesh_flags; + __le16 mesh_reason; + __le16 mesh_pre_value; +} __packed; + +/** + * struct ieee80211_meshconf_ie - Mesh Configuration element + * @meshconf_psel: Active Path Selection Protocol Identifier + * @meshconf_pmetric: Active Path Selection Metric Identifier + * @meshconf_congest: Congestion Control Mode Identifier + * @meshconf_synch: Synchronization Method Identifier + * @meshconf_auth: Authentication Protocol Identifier + * @meshconf_form: Mesh Formation Info + * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags) + * + * This structure represents the payload of the "Mesh Configuration + * element" as described in IEEE Std 802.11-2020 section 9.4.2.97. + */ +struct ieee80211_meshconf_ie { + u8 meshconf_psel; + u8 meshconf_pmetric; + u8 meshconf_congest; + u8 meshconf_synch; + u8 meshconf_auth; + u8 meshconf_form; + u8 meshconf_cap; +} __packed; + +/** + * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags + * + * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish + * additional mesh peerings with other mesh STAs + * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs + * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure + * is ongoing + * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has + * neighbors in deep sleep mode + * + * Enumerates the "Mesh Capability" as described in IEEE Std + * 802.11-2020 section 9.4.2.97.7. + */ +enum mesh_config_capab_flags { + IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS = 0x01, + IEEE80211_MESHCONF_CAPAB_FORWARDING = 0x08, + IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING = 0x20, + IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL = 0x40, +}; + +#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1 + +/* + * mesh channel switch parameters element's flag indicator + * + */ +#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0) +#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1) +#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2) + +/** + * struct ieee80211_rann_ie - RANN (root announcement) element + * @rann_flags: Flags + * @rann_hopcount: Hop Count + * @rann_ttl: Element TTL + * @rann_addr: Root Mesh STA Address + * @rann_seq: HWMP Sequence Number + * @rann_interval: Interval + * @rann_metric: Metric + * + * This structure represents the payload of the "RANN element" as + * described in IEEE Std 802.11-2020 section 9.4.2.111. 
+ */ +struct ieee80211_rann_ie { + u8 rann_flags; + u8 rann_hopcount; + u8 rann_ttl; + u8 rann_addr[ETH_ALEN]; + __le32 rann_seq; + __le32 rann_interval; + __le32 rann_metric; +} __packed; + +enum ieee80211_rann_flags { + RANN_FLAG_IS_GATE = 1 << 0, +}; + +/* Mesh action codes */ +enum ieee80211_mesh_actioncode { + WLAN_MESH_ACTION_LINK_METRIC_REPORT, + WLAN_MESH_ACTION_HWMP_PATH_SELECTION, + WLAN_MESH_ACTION_GATE_ANNOUNCEMENT, + WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION, + WLAN_MESH_ACTION_MCCA_SETUP_REQUEST, + WLAN_MESH_ACTION_MCCA_SETUP_REPLY, + WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST, + WLAN_MESH_ACTION_MCCA_ADVERTISEMENT, + WLAN_MESH_ACTION_MCCA_TEARDOWN, + WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST, + WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE, +}; + +/** + * enum ieee80211_mesh_sync_method - mesh synchronization method identifier + * + * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method + * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method + * that will be specified in a vendor specific information element + */ +enum ieee80211_mesh_sync_method { + IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, + IEEE80211_SYNC_METHOD_VENDOR = 255, +}; + +/** + * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier + * + * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol + * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will + * be specified in a vendor specific information element + */ +enum ieee80211_mesh_path_protocol { + IEEE80211_PATH_PROTOCOL_HWMP = 1, + IEEE80211_PATH_PROTOCOL_VENDOR = 255, +}; + +/** + * enum ieee80211_mesh_path_metric - mesh path selection metric identifier + * + * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric + * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be + * specified in a vendor specific information element + */ +enum ieee80211_mesh_path_metric { + IEEE80211_PATH_METRIC_AIRTIME = 1, + IEEE80211_PATH_METRIC_VENDOR = 255, +}; + +/** + * enum ieee80211_root_mode_identifier - root mesh STA mode identifier + * + * These attribute are used by dot11MeshHWMPRootMode to set root mesh STA mode + * + * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default) + * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than + * this value + * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports + * the proactive PREQ with proactive PREP subfield set to 0 + * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA + * supports the proactive PREQ with proactive PREP subfield set to 1 + * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports + * the proactive RANN + */ +enum ieee80211_root_mode_identifier { + IEEE80211_ROOTMODE_NO_ROOT = 0, + IEEE80211_ROOTMODE_ROOT = 1, + IEEE80211_PROACTIVE_PREQ_NO_PREP = 2, + IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3, + IEEE80211_PROACTIVE_RANN = 4, +}; + +#endif /* LINUX_IEEE80211_MESH_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ddff9102f633..fe78b150ab45 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -252,8 +252,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_SSID_LEN 32 -#define IEEE80211_MAX_MESH_ID_LEN 32 - #define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 @@ -881,40 +879,6 @@ static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr) return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ); } -struct 
ieee80211s_hdr { - u8 flags; - u8 ttl; - __le32 seqnum; - u8 eaddr1[ETH_ALEN]; - u8 eaddr2[ETH_ALEN]; -} __packed __aligned(2); - -/* Mesh flags */ -#define MESH_FLAGS_AE_A4 0x1 -#define MESH_FLAGS_AE_A5_A6 0x2 -#define MESH_FLAGS_AE 0x3 -#define MESH_FLAGS_PS_DEEP 0x4 - -/** - * enum ieee80211_preq_flags - mesh PREQ element flags - * - * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield - */ -enum ieee80211_preq_flags { - IEEE80211_PREQ_PROACTIVE_PREP_FLAG = 1<<2, -}; - -/** - * enum ieee80211_preq_target_flags - mesh PREQ element per target flags - * - * @IEEE80211_PREQ_TO_FLAG: target only subfield - * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield - */ -enum ieee80211_preq_target_flags { - IEEE80211_PREQ_TO_FLAG = 1<<0, - IEEE80211_PREQ_USN_FLAG = 1<<2, -}; - /** * struct ieee80211_quiet_ie - Quiet element * @count: Quiet Count @@ -993,24 +957,6 @@ struct ieee80211_sec_chan_offs_ie { u8 sec_chan_offs; } __packed; -/** - * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE - * @mesh_ttl: Time To Live - * @mesh_flags: Flags - * @mesh_reason: Reason Code - * @mesh_pre_value: Precedence Value - * - * This structure represents the payload of the "Mesh Channel Switch - * Parameters element" as described in IEEE Std 802.11-2020 section - * 9.4.2.102. - */ -struct ieee80211_mesh_chansw_params_ie { - u8 mesh_ttl; - u8 mesh_flags; - __le16 mesh_reason; - __le16 mesh_pre_value; -} __packed; - /** * struct ieee80211_wide_bw_chansw_ie - wide bandwidth channel switch IE * @new_channel_width: New Channel Width @@ -1051,87 +997,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * struct ieee80211_meshconf_ie - Mesh Configuration element - * @meshconf_psel: Active Path Selection Protocol Identifier - * @meshconf_pmetric: Active Path Selection Metric Identifier - * @meshconf_congest: Congestion Control Mode Identifier - * @meshconf_synch: Synchronization Method Identifier - * @meshconf_auth: Authentication Protocol Identifier - * @meshconf_form: Mesh Formation Info - * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags) - * - * This structure represents the payload of the "Mesh Configuration - * element" as described in IEEE Std 802.11-2020 section 9.4.2.97. - */ -struct ieee80211_meshconf_ie { - u8 meshconf_psel; - u8 meshconf_pmetric; - u8 meshconf_congest; - u8 meshconf_synch; - u8 meshconf_auth; - u8 meshconf_form; - u8 meshconf_cap; -} __packed; - -/** - * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags - * - * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish - * additional mesh peerings with other mesh STAs - * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs - * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure - * is ongoing - * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has - * neighbors in deep sleep mode - * - * Enumerates the "Mesh Capability" as described in IEEE Std - * 802.11-2020 section 9.4.2.97.7. 
- */ -enum mesh_config_capab_flags { - IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS = 0x01, - IEEE80211_MESHCONF_CAPAB_FORWARDING = 0x08, - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING = 0x20, - IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL = 0x40, -}; - -#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1 - -/* - * mesh channel switch parameters element's flag indicator - * - */ -#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0) -#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1) -#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2) - -/** - * struct ieee80211_rann_ie - RANN (root announcement) element - * @rann_flags: Flags - * @rann_hopcount: Hop Count - * @rann_ttl: Element TTL - * @rann_addr: Root Mesh STA Address - * @rann_seq: HWMP Sequence Number - * @rann_interval: Interval - * @rann_metric: Metric - * - * This structure represents the payload of the "RANN element" as - * described in IEEE Std 802.11-2020 section 9.4.2.111. - */ -struct ieee80211_rann_ie { - u8 rann_flags; - u8 rann_hopcount; - u8 rann_ttl; - u8 rann_addr[ETH_ALEN]; - __le32 rann_seq; - __le32 rann_interval; - __le32 rann_metric; -} __packed; - -enum ieee80211_rann_flags { - RANN_FLAG_IS_GATE = 1 << 0, -}; - enum ieee80211_ht_chanwidth_values { IEEE80211_HT_CHANWIDTH_20MHZ = 0, IEEE80211_HT_CHANWIDTH_ANY = 1, @@ -3971,21 +3836,6 @@ enum ieee80211_self_protected_actioncode { WLAN_SP_MGK_ACK = 5, }; -/* Mesh action codes */ -enum ieee80211_mesh_actioncode { - WLAN_MESH_ACTION_LINK_METRIC_REPORT, - WLAN_MESH_ACTION_HWMP_PATH_SELECTION, - WLAN_MESH_ACTION_GATE_ANNOUNCEMENT, - WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION, - WLAN_MESH_ACTION_MCCA_SETUP_REQUEST, - WLAN_MESH_ACTION_MCCA_SETUP_REPLY, - WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST, - WLAN_MESH_ACTION_MCCA_ADVERTISEMENT, - WLAN_MESH_ACTION_MCCA_TEARDOWN, - WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST, - WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE, -}; - /* Unprotected WNM action codes */ enum ieee80211_unprotected_wnm_actioncode { WLAN_UNPROTECTED_WNM_ACTION_TIM = 0, @@ -4198,65 +4048,6 @@ enum ieee80211_tdls_actioncode { /* BSS Coex IE information field bits */ #define WLAN_BSS_COEX_INFORMATION_REQUEST BIT(0) -/** - * enum ieee80211_mesh_sync_method - mesh synchronization method identifier - * - * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method - * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method - * that will be specified in a vendor specific information element - */ -enum ieee80211_mesh_sync_method { - IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, - IEEE80211_SYNC_METHOD_VENDOR = 255, -}; - -/** - * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier - * - * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol - * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will - * be specified in a vendor specific information element - */ -enum ieee80211_mesh_path_protocol { - IEEE80211_PATH_PROTOCOL_HWMP = 1, - IEEE80211_PATH_PROTOCOL_VENDOR = 255, -}; - -/** - * enum ieee80211_mesh_path_metric - mesh path selection metric identifier - * - * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric - * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be - * specified in a vendor specific information element - */ -enum ieee80211_mesh_path_metric { - IEEE80211_PATH_METRIC_AIRTIME = 1, - IEEE80211_PATH_METRIC_VENDOR = 255, -}; - -/** - * enum ieee80211_root_mode_identifier - root mesh STA mode identifier - * - * These attribute are used by 
dot11MeshHWMPRootMode to set root mesh STA mode - * - * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default) - * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than - * this value - * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports - * the proactive PREQ with proactive PREP subfield set to 0 - * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA - * supports the proactive PREQ with proactive PREP subfield set to 1 - * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports - * the proactive RANN - */ -enum ieee80211_root_mode_identifier { - IEEE80211_ROOTMODE_NO_ROOT = 0, - IEEE80211_ROOTMODE_ROOT = 1, - IEEE80211_PROACTIVE_PREQ_NO_PREP = 2, - IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3, - IEEE80211_PROACTIVE_RANN = 4, -}; - /* * IEEE 802.11-2007 7.3.2.9 Country information element * @@ -6098,4 +5889,6 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 +#include "ieee80211-mesh.h" + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From fdc1c141f3ef4dc94e3880e973061681843f62c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:50 +0100 Subject: wifi: ieee80211: split HT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting HT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.7532471178d0.Id956a5433ad8658e4e5c0272dbcbb59587206142@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-ht.h | 292 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 272 +--------------------------------------- 2 files changed, 293 insertions(+), 271 deletions(-) create mode 100644 include/linux/ieee80211-ht.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h new file mode 100644 index 000000000000..21bbf470540f --- /dev/null +++ b/include/linux/ieee80211-ht.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 HT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_HT_H +#define LINUX_IEEE80211_HT_H + +#include +#include + +/* Maximal size of an A-MSDU that can be transported in a HT BA session */ +#define IEEE80211_MAX_MPDU_LEN_HT_BA 4095 + +/* Maximal size of an A-MSDU */ +#define IEEE80211_MAX_MPDU_LEN_HT_3839 3839 +#define IEEE80211_MAX_MPDU_LEN_HT_7935 7935 + +#define IEEE80211_HT_CTL_LEN 4 + +enum ieee80211_ht_chanwidth_values { + IEEE80211_HT_CHANWIDTH_20MHZ = 0, + IEEE80211_HT_CHANWIDTH_ANY = 1, +}; + +/** + * struct ieee80211_bar - Block Ack Request frame format + * @frame_control: Frame Control + * @duration: Duration + * @ra: RA + * @ta: TA + * @control: BAR Control + * @start_seq_num: Starting Sequence Number (see Figure 9-37) + * + * This structure represents the "BlockAckReq frame format" + * as described in IEEE Std 802.11-2020 section 9.3.1.7. 
+*/ +struct ieee80211_bar { + __le16 frame_control; + __le16 duration; + __u8 ra[ETH_ALEN]; + __u8 ta[ETH_ALEN]; + __le16 control; + __le16 start_seq_num; +} __packed; + +/* 802.11 BAR control masks */ +#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL 0x0000 +#define IEEE80211_BAR_CTRL_MULTI_TID 0x0002 +#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA 0x0004 +#define IEEE80211_BAR_CTRL_TID_INFO_MASK 0xf000 +#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT 12 + +#define IEEE80211_HT_MCS_MASK_LEN 10 + +/** + * struct ieee80211_mcs_info - Supported MCS Set field + * @rx_mask: RX mask + * @rx_highest: highest supported RX rate. If set represents + * the highest supported RX data rate in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported. + * @tx_params: TX parameters + * @reserved: Reserved bits + * + * This structure represents the "Supported MCS Set field" as + * described in IEEE Std 802.11-2020 section 9.4.2.55.4. + */ +struct ieee80211_mcs_info { + u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN]; + __le16 rx_highest; + u8 tx_params; + u8 reserved[3]; +} __packed; + +/* 802.11n HT capability MSC set */ +#define IEEE80211_HT_MCS_RX_HIGHEST_MASK 0x3ff +#define IEEE80211_HT_MCS_TX_DEFINED 0x01 +#define IEEE80211_HT_MCS_TX_RX_DIFF 0x02 +/* value 0 == 1 stream etc */ +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK 0x0C +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT 2 +#define IEEE80211_HT_MCS_TX_MAX_STREAMS 4 +#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION 0x10 + +#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3))) + +/* + * 802.11n D5.0 20.3.5 / 20.6 says: + * - indices 0 to 7 and 32 are single spatial stream + * - 8 to 31 are multiple spatial streams using equal modulation + * [8..15 for two streams, 16..23 for three and 24..31 for four] + * - remainder are multiple spatial streams using unequal modulation + */ +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33 +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \ + (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8) + +/** + * struct ieee80211_ht_cap - HT capabilities element + * @cap_info: HT Capability Information + * @ampdu_params_info: A-MPDU Parameters + * @mcs: Supported MCS Set + * @extended_ht_cap_info: HT Extended Capabilities + * @tx_BF_cap_info: Transmit Beamforming Capabilities + * @antenna_selection_info: ASEL Capability + * + * This structure represents the payload of the "HT Capabilities + * element" as described in IEEE Std 802.11-2020 section 9.4.2.55. 
+ */ +struct ieee80211_ht_cap { + __le16 cap_info; + u8 ampdu_params_info; + + /* 16 bytes MCS information */ + struct ieee80211_mcs_info mcs; + + __le16 extended_ht_cap_info; + __le32 tx_BF_cap_info; + u8 antenna_selection_info; +} __packed; + +/* 802.11n HT capabilities masks (for cap_info) */ +#define IEEE80211_HT_CAP_LDPC_CODING 0x0001 +#define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 +#define IEEE80211_HT_CAP_SM_PS 0x000C +#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 +#define IEEE80211_HT_CAP_GRN_FLD 0x0010 +#define IEEE80211_HT_CAP_SGI_20 0x0020 +#define IEEE80211_HT_CAP_SGI_40 0x0040 +#define IEEE80211_HT_CAP_TX_STBC 0x0080 +#define IEEE80211_HT_CAP_RX_STBC 0x0300 +#define IEEE80211_HT_CAP_RX_STBC_SHIFT 8 +#define IEEE80211_HT_CAP_DELAY_BA 0x0400 +#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 +#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 +#define IEEE80211_HT_CAP_RESERVED 0x2000 +#define IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 +#define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 + +/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */ +#define IEEE80211_HT_EXT_CAP_PCO 0x0001 +#define IEEE80211_HT_EXT_CAP_PCO_TIME 0x0006 +#define IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT 1 +#define IEEE80211_HT_EXT_CAP_MCS_FB 0x0300 +#define IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT 8 +#define IEEE80211_HT_EXT_CAP_HTC_SUP 0x0400 +#define IEEE80211_HT_EXT_CAP_RD_RESPONDER 0x0800 + +/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ +#define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 +#define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C +#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 + +/* + * Maximum length of AMPDU that the STA can receive in high-throughput (HT). + * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) + */ +enum ieee80211_max_ampdu_length_exp { + IEEE80211_HT_MAX_AMPDU_8K = 0, + IEEE80211_HT_MAX_AMPDU_16K = 1, + IEEE80211_HT_MAX_AMPDU_32K = 2, + IEEE80211_HT_MAX_AMPDU_64K = 3 +}; + +#define IEEE80211_HT_MAX_AMPDU_FACTOR 13 + +/* Minimum MPDU start spacing */ +enum ieee80211_min_mpdu_spacing { + IEEE80211_HT_MPDU_DENSITY_NONE = 0, /* No restriction */ + IEEE80211_HT_MPDU_DENSITY_0_25 = 1, /* 1/4 usec */ + IEEE80211_HT_MPDU_DENSITY_0_5 = 2, /* 1/2 usec */ + IEEE80211_HT_MPDU_DENSITY_1 = 3, /* 1 usec */ + IEEE80211_HT_MPDU_DENSITY_2 = 4, /* 2 usec */ + IEEE80211_HT_MPDU_DENSITY_4 = 5, /* 4 usec */ + IEEE80211_HT_MPDU_DENSITY_8 = 6, /* 8 usec */ + IEEE80211_HT_MPDU_DENSITY_16 = 7 /* 16 usec */ +}; + +/** + * struct ieee80211_ht_operation - HT operation IE + * @primary_chan: Primary Channel + * @ht_param: HT Operation Information parameters + * @operation_mode: HT Operation Information operation mode + * @stbc_param: HT Operation Information STBC params + * @basic_set: Basic HT-MCS Set + * + * This structure represents the payload of the "HT Operation + * element" as described in IEEE Std 802.11-2020 section 9.4.2.56. 
+ */ +struct ieee80211_ht_operation { + u8 primary_chan; + u8 ht_param; + __le16 operation_mode; + __le16 stbc_param; + u8 basic_set[16]; +} __packed; + +/* for ht_param */ +#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET 0x03 +#define IEEE80211_HT_PARAM_CHA_SEC_NONE 0x00 +#define IEEE80211_HT_PARAM_CHA_SEC_ABOVE 0x01 +#define IEEE80211_HT_PARAM_CHA_SEC_BELOW 0x03 +#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY 0x04 +#define IEEE80211_HT_PARAM_RIFS_MODE 0x08 + +/* for operation_mode */ +#define IEEE80211_HT_OP_MODE_PROTECTION 0x0003 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONE 0 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER 1 +#define IEEE80211_HT_OP_MODE_PROTECTION_20MHZ 2 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 +#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 +#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 +#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 +#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 + +/* for stbc_param */ +#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 +#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT 0x0080 +#define IEEE80211_HT_STBC_PARAM_STBC_BEACON 0x0100 +#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT 0x0200 +#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE 0x0400 +#define IEEE80211_HT_STBC_PARAM_PCO_PHASE 0x0800 + + +/* block-ack parameters */ +#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001 +#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 +#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C +#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 +#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 +#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 + +/* + * A-MPDU buffer sizes + * According to HT size varies from 8 to 64 frames + * HE adds the ability to have up to 256 frames. + * EHT adds the ability to have up to 1K frames. 
+ */ +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 +#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 +#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 + + +/* Spatial Multiplexing Power Save Modes (for capability) */ +#define WLAN_HT_CAP_SM_PS_STATIC 0 +#define WLAN_HT_CAP_SM_PS_DYNAMIC 1 +#define WLAN_HT_CAP_SM_PS_INVALID 2 +#define WLAN_HT_CAP_SM_PS_DISABLED 3 + +/* for SM power control field lower two bits */ +#define WLAN_HT_SMPS_CONTROL_DISABLED 0 +#define WLAN_HT_SMPS_CONTROL_STATIC 1 +#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 + +/* HT action codes */ +enum ieee80211_ht_actioncode { + WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, + WLAN_HT_ACTION_SMPS = 1, + WLAN_HT_ACTION_PSMP = 2, + WLAN_HT_ACTION_PCO_PHASE = 3, + WLAN_HT_ACTION_CSI = 4, + WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, + WLAN_HT_ACTION_COMPRESSED_BF = 6, + WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, +}; + +/* BACK action code */ +enum ieee80211_back_actioncode { + WLAN_ACTION_ADDBA_REQ = 0, + WLAN_ACTION_ADDBA_RESP = 1, + WLAN_ACTION_DELBA = 2, +}; + +/* BACK (block-ack) parties */ +enum ieee80211_back_parties { + WLAN_BACK_RECIPIENT = 0, + WLAN_BACK_INITIATOR = 1, +}; + +#endif /* LINUX_IEEE80211_HT_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe78b150ab45..0a9b4a8025cd 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -239,13 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */ #define IEEE80211_MAX_FRAME_LEN 2352 -/* Maximal size of an A-MSDU that can be transported in a HT BA session */ -#define IEEE80211_MAX_MPDU_LEN_HT_BA 4095 - -/* Maximal size of an A-MSDU */ -#define IEEE80211_MAX_MPDU_LEN_HT_3839 3839 -#define IEEE80211_MAX_MPDU_LEN_HT_7935 7935 - #define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 #define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 #define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 @@ -302,8 +295,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 -#define IEEE80211_HT_CTL_LEN 4 - /* trigger type within common_info of trigger frame */ #define IEEE80211_TRIGGER_TYPE_MASK 0xf #define IEEE80211_TRIGGER_TYPE_BASIC 0x0 @@ -997,11 +988,6 @@ struct ieee80211_tim_ie { }; } __packed; -enum ieee80211_ht_chanwidth_values { - IEEE80211_HT_CHANWIDTH_20MHZ = 0, - IEEE80211_HT_CHANWIDTH_ANY = 1, -}; - /** * enum ieee80211_vht_opmode_bits - VHT operating mode field bits * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask @@ -1677,146 +1663,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/** - * struct ieee80211_bar - Block Ack Request frame format - * @frame_control: Frame Control - * @duration: Duration - * @ra: RA - * @ta: TA - * @control: BAR Control - * @start_seq_num: Starting Sequence Number (see Figure 9-37) - * - * This structure represents the "BlockAckReq frame format" - * as described in IEEE Std 802.11-2020 section 9.3.1.7. 
-*/ -struct ieee80211_bar { - __le16 frame_control; - __le16 duration; - __u8 ra[ETH_ALEN]; - __u8 ta[ETH_ALEN]; - __le16 control; - __le16 start_seq_num; -} __packed; - -/* 802.11 BAR control masks */ -#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL 0x0000 -#define IEEE80211_BAR_CTRL_MULTI_TID 0x0002 -#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA 0x0004 -#define IEEE80211_BAR_CTRL_TID_INFO_MASK 0xf000 -#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT 12 - -#define IEEE80211_HT_MCS_MASK_LEN 10 - -/** - * struct ieee80211_mcs_info - Supported MCS Set field - * @rx_mask: RX mask - * @rx_highest: highest supported RX rate. If set represents - * the highest supported RX data rate in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest RX data rate supported. - * @tx_params: TX parameters - * @reserved: Reserved bits - * - * This structure represents the "Supported MCS Set field" as - * described in IEEE Std 802.11-2020 section 9.4.2.55.4. - */ -struct ieee80211_mcs_info { - u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN]; - __le16 rx_highest; - u8 tx_params; - u8 reserved[3]; -} __packed; - -/* 802.11n HT capability MSC set */ -#define IEEE80211_HT_MCS_RX_HIGHEST_MASK 0x3ff -#define IEEE80211_HT_MCS_TX_DEFINED 0x01 -#define IEEE80211_HT_MCS_TX_RX_DIFF 0x02 -/* value 0 == 1 stream etc */ -#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK 0x0C -#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT 2 -#define IEEE80211_HT_MCS_TX_MAX_STREAMS 4 -#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION 0x10 - -#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3))) - -/* - * 802.11n D5.0 20.3.5 / 20.6 says: - * - indices 0 to 7 and 32 are single spatial stream - * - 8 to 31 are multiple spatial streams using equal modulation - * [8..15 for two streams, 16..23 for three and 24..31 for four] - * - remainder are multiple spatial streams using unequal modulation - */ -#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33 -#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \ - (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8) - -/** - * struct ieee80211_ht_cap - HT capabilities element - * @cap_info: HT Capability Information - * @ampdu_params_info: A-MPDU Parameters - * @mcs: Supported MCS Set - * @extended_ht_cap_info: HT Extended Capabilities - * @tx_BF_cap_info: Transmit Beamforming Capabilities - * @antenna_selection_info: ASEL Capability - * - * This structure represents the payload of the "HT Capabilities - * element" as described in IEEE Std 802.11-2020 section 9.4.2.55. 
- */ -struct ieee80211_ht_cap { - __le16 cap_info; - u8 ampdu_params_info; - - /* 16 bytes MCS information */ - struct ieee80211_mcs_info mcs; - - __le16 extended_ht_cap_info; - __le32 tx_BF_cap_info; - u8 antenna_selection_info; -} __packed; - -/* 802.11n HT capabilities masks (for cap_info) */ -#define IEEE80211_HT_CAP_LDPC_CODING 0x0001 -#define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 -#define IEEE80211_HT_CAP_SM_PS 0x000C -#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 -#define IEEE80211_HT_CAP_GRN_FLD 0x0010 -#define IEEE80211_HT_CAP_SGI_20 0x0020 -#define IEEE80211_HT_CAP_SGI_40 0x0040 -#define IEEE80211_HT_CAP_TX_STBC 0x0080 -#define IEEE80211_HT_CAP_RX_STBC 0x0300 -#define IEEE80211_HT_CAP_RX_STBC_SHIFT 8 -#define IEEE80211_HT_CAP_DELAY_BA 0x0400 -#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 -#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 -#define IEEE80211_HT_CAP_RESERVED 0x2000 -#define IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 -#define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 - -/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */ -#define IEEE80211_HT_EXT_CAP_PCO 0x0001 -#define IEEE80211_HT_EXT_CAP_PCO_TIME 0x0006 -#define IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT 1 -#define IEEE80211_HT_EXT_CAP_MCS_FB 0x0300 -#define IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT 8 -#define IEEE80211_HT_EXT_CAP_HTC_SUP 0x0400 -#define IEEE80211_HT_EXT_CAP_RD_RESPONDER 0x0800 - -/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ -#define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 -#define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C -#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 - -/* - * Maximum length of AMPDU that the STA can receive in high-throughput (HT). - * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) - */ -enum ieee80211_max_ampdu_length_exp { - IEEE80211_HT_MAX_AMPDU_8K = 0, - IEEE80211_HT_MAX_AMPDU_16K = 1, - IEEE80211_HT_MAX_AMPDU_32K = 2, - IEEE80211_HT_MAX_AMPDU_64K = 3 -}; - /* * Maximum length of AMPDU that the STA can receive in VHT. * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) @@ -1832,98 +1678,6 @@ enum ieee80211_vht_max_ampdu_length_exp { IEEE80211_VHT_MAX_AMPDU_1024K = 7 }; -#define IEEE80211_HT_MAX_AMPDU_FACTOR 13 - -/* Minimum MPDU start spacing */ -enum ieee80211_min_mpdu_spacing { - IEEE80211_HT_MPDU_DENSITY_NONE = 0, /* No restriction */ - IEEE80211_HT_MPDU_DENSITY_0_25 = 1, /* 1/4 usec */ - IEEE80211_HT_MPDU_DENSITY_0_5 = 2, /* 1/2 usec */ - IEEE80211_HT_MPDU_DENSITY_1 = 3, /* 1 usec */ - IEEE80211_HT_MPDU_DENSITY_2 = 4, /* 2 usec */ - IEEE80211_HT_MPDU_DENSITY_4 = 5, /* 4 usec */ - IEEE80211_HT_MPDU_DENSITY_8 = 6, /* 8 usec */ - IEEE80211_HT_MPDU_DENSITY_16 = 7 /* 16 usec */ -}; - -/** - * struct ieee80211_ht_operation - HT operation IE - * @primary_chan: Primary Channel - * @ht_param: HT Operation Information parameters - * @operation_mode: HT Operation Information operation mode - * @stbc_param: HT Operation Information STBC params - * @basic_set: Basic HT-MCS Set - * - * This structure represents the payload of the "HT Operation - * element" as described in IEEE Std 802.11-2020 section 9.4.2.56. 
- */ -struct ieee80211_ht_operation { - u8 primary_chan; - u8 ht_param; - __le16 operation_mode; - __le16 stbc_param; - u8 basic_set[16]; -} __packed; - -/* for ht_param */ -#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET 0x03 -#define IEEE80211_HT_PARAM_CHA_SEC_NONE 0x00 -#define IEEE80211_HT_PARAM_CHA_SEC_ABOVE 0x01 -#define IEEE80211_HT_PARAM_CHA_SEC_BELOW 0x03 -#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY 0x04 -#define IEEE80211_HT_PARAM_RIFS_MODE 0x08 - -/* for operation_mode */ -#define IEEE80211_HT_OP_MODE_PROTECTION 0x0003 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONE 0 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER 1 -#define IEEE80211_HT_OP_MODE_PROTECTION_20MHZ 2 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 -#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 -#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 -#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 -#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 - -/* for stbc_param */ -#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 -#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT 0x0080 -#define IEEE80211_HT_STBC_PARAM_STBC_BEACON 0x0100 -#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT 0x0200 -#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE 0x0400 -#define IEEE80211_HT_STBC_PARAM_PCO_PHASE 0x0800 - - -/* block-ack parameters */ -#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001 -#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 -#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C -#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 -#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 -#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 - -/* - * A-MPDU buffer sizes - * According to HT size varies from 8 to 64 frames - * HE adds the ability to have up to 256 frames. - * EHT adds the ability to have up to 1K frames. 
- */ -#define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 -#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 -#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 - - -/* Spatial Multiplexing Power Save Modes (for capability) */ -#define WLAN_HT_CAP_SM_PS_STATIC 0 -#define WLAN_HT_CAP_SM_PS_DYNAMIC 1 -#define WLAN_HT_CAP_SM_PS_INVALID 2 -#define WLAN_HT_CAP_SM_PS_DISABLED 3 - -/* for SM power control field lower two bits */ -#define WLAN_HT_SMPS_CONTROL_DISABLED 0 -#define WLAN_HT_SMPS_CONTROL_STATIC 1 -#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 - /** * struct ieee80211_vht_mcs_info - VHT MCS information * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams @@ -3807,18 +3561,6 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; -/* HT action codes */ -enum ieee80211_ht_actioncode { - WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, - WLAN_HT_ACTION_SMPS = 1, - WLAN_HT_ACTION_PSMP = 2, - WLAN_HT_ACTION_PCO_PHASE = 3, - WLAN_HT_ACTION_CSI = 4, - WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, - WLAN_HT_ACTION_COMPRESSED_BF = 6, - WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, -}; - /* VHT action codes */ enum ieee80211_vht_actioncode { WLAN_VHT_ACTION_COMPRESSED_BF = 0, @@ -4155,19 +3897,6 @@ struct ieee80211_bss_max_idle_period_ie { u8 idle_options; } __packed; -/* BACK action code */ -enum ieee80211_back_actioncode { - WLAN_ACTION_ADDBA_REQ = 0, - WLAN_ACTION_ADDBA_RESP = 1, - WLAN_ACTION_DELBA = 2, -}; - -/* BACK (block-ack) parties */ -enum ieee80211_back_parties { - WLAN_BACK_RECIPIENT = 0, - WLAN_BACK_INITIATOR = 1, -}; - /* SA Query action */ enum ieee80211_sa_query_action { WLAN_ACTION_SA_QUERY_REQUEST = 0, @@ -5889,6 +5618,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 +#include "ieee80211-ht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 7cb14da1d7bbfa4a6417ed7f1bc07dd77bcd9c83 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:51 +0100 Subject: wifi: ieee80211: split VHT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting VHT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.c31cb771a250.I787a13064db7d80440101de3445be17881daf1b6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-vht.h | 236 ++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 216 +------------------------------------- 2 files changed, 237 insertions(+), 215 deletions(-) create mode 100644 include/linux/ieee80211-vht.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-vht.h b/include/linux/ieee80211-vht.h new file mode 100644 index 000000000000..898dfb561fef --- /dev/null +++ b/include/linux/ieee80211-vht.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 VHT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_VHT_H +#define LINUX_IEEE80211_VHT_H + +#include +#include + +#define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 +#define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 +#define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 + +/** + * enum ieee80211_vht_opmode_bits - VHT operating mode field bits + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag + * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask + * (the NSS value is the value of this field + 1) + * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift + * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU + * using a beamforming steering matrix + */ +enum ieee80211_vht_opmode_bits { + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, + IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, + IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, + IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, +}; + +/* + * Maximum length of AMPDU that the STA can receive in VHT. + * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) + */ +enum ieee80211_vht_max_ampdu_length_exp { + IEEE80211_VHT_MAX_AMPDU_8K = 0, + IEEE80211_VHT_MAX_AMPDU_16K = 1, + IEEE80211_VHT_MAX_AMPDU_32K = 2, + IEEE80211_VHT_MAX_AMPDU_64K = 3, + IEEE80211_VHT_MAX_AMPDU_128K = 4, + IEEE80211_VHT_MAX_AMPDU_256K = 5, + IEEE80211_VHT_MAX_AMPDU_512K = 6, + IEEE80211_VHT_MAX_AMPDU_1024K = 7 +}; + +/** + * struct ieee80211_vht_mcs_info - VHT MCS information + * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams + * @rx_highest: Indicates highest long GI VHT PPDU data rate + * STA can receive. Rate expressed in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported. + * The top 3 bits of this field indicate the Maximum NSTS,total + * (a beamformee capability.) + * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams + * @tx_highest: Indicates highest long GI VHT PPDU data rate + * STA can transmit. Rate expressed in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest TX data rate supported. + * The top 2 bits of this field are reserved, the + * 3rd bit from the top indiciates VHT Extended NSS BW + * Capability. 
+ */ +struct ieee80211_vht_mcs_info { + __le16 rx_mcs_map; + __le16 rx_highest; + __le16 tx_mcs_map; + __le16 tx_highest; +} __packed; + +/* for rx_highest */ +#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT 13 +#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT) + +/* for tx_highest */ +#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE (1 << 13) + +/** + * enum ieee80211_vht_mcs_support - VHT MCS support definitions + * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported + * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the @rx_mcs_map + * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. + */ +enum ieee80211_vht_mcs_support { + IEEE80211_VHT_MCS_SUPPORT_0_7 = 0, + IEEE80211_VHT_MCS_SUPPORT_0_8 = 1, + IEEE80211_VHT_MCS_SUPPORT_0_9 = 2, + IEEE80211_VHT_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_vht_cap - VHT capabilities + * + * This structure is the "VHT capabilities element" as + * described in 802.11ac D3.0 8.4.2.160 + * @vht_cap_info: VHT capability info + * @supp_mcs: VHT MCS supported rates + */ +struct ieee80211_vht_cap { + __le32 vht_cap_info; + struct ieee80211_vht_mcs_info supp_mcs; +} __packed; + +/** + * enum ieee80211_vht_chanwidth - VHT channel width + * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to + * determine the channel width (20 or 40 MHz) + * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth + * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth + * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth + */ +enum ieee80211_vht_chanwidth { + IEEE80211_VHT_CHANWIDTH_USE_HT = 0, + IEEE80211_VHT_CHANWIDTH_80MHZ = 1, + IEEE80211_VHT_CHANWIDTH_160MHZ = 2, + IEEE80211_VHT_CHANWIDTH_80P80MHZ = 3, +}; + +/** + * struct ieee80211_vht_operation - VHT operation IE + * + * This structure is the "VHT operation element" as + * described in 802.11ac D3.0 8.4.2.161 + * @chan_width: Operating channel width + * @center_freq_seg0_idx: center freq segment 0 index + * @center_freq_seg1_idx: center freq segment 1 index + * @basic_mcs_set: VHT Basic MCS rate set + */ +struct ieee80211_vht_operation { + u8 chan_width; + u8 center_freq_seg0_idx; + u8 center_freq_seg1_idx; + __le16 basic_mcs_set; +} __packed; + +/* 802.11ac VHT Capabilities */ +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 0x00000002 +#define IEEE80211_VHT_CAP_MAX_MPDU_MASK 0x00000003 +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ 0x00000004 +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ 0x00000008 +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK 0x0000000C +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT 2 +#define IEEE80211_VHT_CAP_RXLDPC 0x00000010 +#define IEEE80211_VHT_CAP_SHORT_GI_80 0x00000020 +#define IEEE80211_VHT_CAP_SHORT_GI_160 0x00000040 +#define IEEE80211_VHT_CAP_TXSTBC 0x00000080 +#define IEEE80211_VHT_CAP_RXSTBC_1 0x00000100 +#define IEEE80211_VHT_CAP_RXSTBC_2 0x00000200 +#define IEEE80211_VHT_CAP_RXSTBC_3 0x00000300 +#define IEEE80211_VHT_CAP_RXSTBC_4 0x00000400 +#define IEEE80211_VHT_CAP_RXSTBC_MASK 0x00000700 +#define 
IEEE80211_VHT_CAP_RXSTBC_SHIFT 8 +#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE 0x00000800 +#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE 0x00001000 +#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT 13 +#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK \ + (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT) +#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT 16 +#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK \ + (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT) +#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE 0x00080000 +#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE 0x00100000 +#define IEEE80211_VHT_CAP_VHT_TXOP_PS 0x00200000 +#define IEEE80211_VHT_CAP_HTC_VHT 0x00400000 +#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT 23 +#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK \ + (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT) +#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB 0x08000000 +#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB 0x0c000000 +#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 +#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 +#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT 30 +#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK 0xc0000000 + +/** + * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS + * @cap: VHT capabilities of the peer + * @bw: bandwidth to use + * @mcs: MCS index to use + * @ext_nss_bw_capable: indicates whether or not the local transmitter + * (rate scaling algorithm) can deal with the new logic + * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) + * Return: The maximum NSS that can be used for the given bandwidth/MCS + * combination + * + * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can + * vary for a given BW/MCS. This function parses the data. + * + * Note: This function is exported by cfg80211. 
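
As a rough illustration of the MCS-map layout documented above (this sketch is not taken from the patch), the rx_mcs_map/tx_mcs_map fields pack one 2-bit ieee80211_vht_mcs_support value per spatial stream, with NSS 1 in the least significant bits. The helper name below is hypothetical and plain C types are used instead of the kernel's __le16 handling:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: return the 2-bit subfield for a 1-based NSS from a
 * host-order VHT MCS map (0..3 correspond to IEEE80211_VHT_MCS_SUPPORT_0_7,
 * _0_8, _0_9 and IEEE80211_VHT_MCS_NOT_SUPPORTED). */
static unsigned int vht_mcs_subfield(uint16_t mcs_map, unsigned int nss)
{
	return (mcs_map >> (2 * (nss - 1))) & 0x3;
}

int main(void)
{
	uint16_t map = 0xfffa;	/* NSS 1-2: MCS 0-9, NSS 3-8: not supported */
	unsigned int nss;

	for (nss = 1; nss <= 8; nss++)
		printf("NSS %u -> subfield %u\n", nss, vht_mcs_subfield(map, nss));
	return 0;
}
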
+ */ +int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, + enum ieee80211_vht_chanwidth bw, + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); + +/* VHT action codes */ +enum ieee80211_vht_actioncode { + WLAN_VHT_ACTION_COMPRESSED_BF = 0, + WLAN_VHT_ACTION_GROUPID_MGMT = 1, + WLAN_VHT_ACTION_OPMODE_NOTIF = 2, +}; + +#endif /* LINUX_IEEE80211_VHT_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0a9b4a8025cd..0b247b28c661 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -239,10 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */ #define IEEE80211_MAX_FRAME_LEN 2352 -#define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 -#define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 -#define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 - #define IEEE80211_MAX_SSID_LEN 32 #define IEEE80211_FIRST_TSPEC_TSID 8 @@ -988,32 +984,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * enum ieee80211_vht_opmode_bits - VHT operating mode field bits - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width - * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag - * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask - * (the NSS value is the value of this field + 1) - * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift - * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU - * using a beamforming steering matrix - */ -enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, - IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, - IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, - IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, - IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, -}; - /** * enum ieee80211_s1g_chanwidth - S1G channel widths * These are defined in IEEE802.11-2016ah Table 10-20 @@ -1663,119 +1633,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/* - * Maximum length of AMPDU that the STA can receive in VHT. - * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) - */ -enum ieee80211_vht_max_ampdu_length_exp { - IEEE80211_VHT_MAX_AMPDU_8K = 0, - IEEE80211_VHT_MAX_AMPDU_16K = 1, - IEEE80211_VHT_MAX_AMPDU_32K = 2, - IEEE80211_VHT_MAX_AMPDU_64K = 3, - IEEE80211_VHT_MAX_AMPDU_128K = 4, - IEEE80211_VHT_MAX_AMPDU_256K = 5, - IEEE80211_VHT_MAX_AMPDU_512K = 6, - IEEE80211_VHT_MAX_AMPDU_1024K = 7 -}; - -/** - * struct ieee80211_vht_mcs_info - VHT MCS information - * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams - * @rx_highest: Indicates highest long GI VHT PPDU data rate - * STA can receive. Rate expressed in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest RX data rate supported. - * The top 3 bits of this field indicate the Maximum NSTS,total - * (a beamformee capability.) 
- * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams - * @tx_highest: Indicates highest long GI VHT PPDU data rate - * STA can transmit. Rate expressed in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest TX data rate supported. - * The top 2 bits of this field are reserved, the - * 3rd bit from the top indiciates VHT Extended NSS BW - * Capability. - */ -struct ieee80211_vht_mcs_info { - __le16 rx_mcs_map; - __le16 rx_highest; - __le16 tx_mcs_map; - __le16 tx_highest; -} __packed; - -/* for rx_highest */ -#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT 13 -#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT) - -/* for tx_highest */ -#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE (1 << 13) - -/** - * enum ieee80211_vht_mcs_support - VHT MCS support definitions - * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the - * number of streams - * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported - * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported - * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported - * - * These definitions are used in each 2-bit subfield of the @rx_mcs_map - * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are - * both split into 8 subfields by number of streams. These values indicate - * which MCSes are supported for the number of streams the value appears - * for. - */ -enum ieee80211_vht_mcs_support { - IEEE80211_VHT_MCS_SUPPORT_0_7 = 0, - IEEE80211_VHT_MCS_SUPPORT_0_8 = 1, - IEEE80211_VHT_MCS_SUPPORT_0_9 = 2, - IEEE80211_VHT_MCS_NOT_SUPPORTED = 3, -}; - -/** - * struct ieee80211_vht_cap - VHT capabilities - * - * This structure is the "VHT capabilities element" as - * described in 802.11ac D3.0 8.4.2.160 - * @vht_cap_info: VHT capability info - * @supp_mcs: VHT MCS supported rates - */ -struct ieee80211_vht_cap { - __le32 vht_cap_info; - struct ieee80211_vht_mcs_info supp_mcs; -} __packed; - -/** - * enum ieee80211_vht_chanwidth - VHT channel width - * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to - * determine the channel width (20 or 40 MHz) - * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth - * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth - * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth - */ -enum ieee80211_vht_chanwidth { - IEEE80211_VHT_CHANWIDTH_USE_HT = 0, - IEEE80211_VHT_CHANWIDTH_80MHZ = 1, - IEEE80211_VHT_CHANWIDTH_160MHZ = 2, - IEEE80211_VHT_CHANWIDTH_80P80MHZ = 3, -}; - -/** - * struct ieee80211_vht_operation - VHT operation IE - * - * This structure is the "VHT operation element" as - * described in 802.11ac D3.0 8.4.2.161 - * @chan_width: Operating channel width - * @center_freq_seg0_idx: center freq segment 0 index - * @center_freq_seg1_idx: center freq segment 1 index - * @basic_mcs_set: VHT Basic MCS rate set - */ -struct ieee80211_vht_operation { - u8 chan_width; - u8 center_freq_seg0_idx; - u8 center_freq_seg1_idx; - __le16 basic_mcs_set; -} __packed; - /** * struct ieee80211_he_cap_elem - HE capabilities element * @mac_cap_info: HE MAC Capabilities Information @@ -2045,71 +1902,6 @@ struct ieee80211_eht_operation_info { u8 optional[]; } __packed; -/* 802.11ac VHT Capabilities */ -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 0x00000002 -#define IEEE80211_VHT_CAP_MAX_MPDU_MASK 0x00000003 -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ 0x00000004 -#define 
IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ 0x00000008 -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK 0x0000000C -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT 2 -#define IEEE80211_VHT_CAP_RXLDPC 0x00000010 -#define IEEE80211_VHT_CAP_SHORT_GI_80 0x00000020 -#define IEEE80211_VHT_CAP_SHORT_GI_160 0x00000040 -#define IEEE80211_VHT_CAP_TXSTBC 0x00000080 -#define IEEE80211_VHT_CAP_RXSTBC_1 0x00000100 -#define IEEE80211_VHT_CAP_RXSTBC_2 0x00000200 -#define IEEE80211_VHT_CAP_RXSTBC_3 0x00000300 -#define IEEE80211_VHT_CAP_RXSTBC_4 0x00000400 -#define IEEE80211_VHT_CAP_RXSTBC_MASK 0x00000700 -#define IEEE80211_VHT_CAP_RXSTBC_SHIFT 8 -#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE 0x00000800 -#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE 0x00001000 -#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT 13 -#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK \ - (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT) -#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT 16 -#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK \ - (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT) -#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE 0x00080000 -#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE 0x00100000 -#define IEEE80211_VHT_CAP_VHT_TXOP_PS 0x00200000 -#define IEEE80211_VHT_CAP_HTC_VHT 0x00400000 -#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT 23 -#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK \ - (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT) -#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB 0x08000000 -#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB 0x0c000000 -#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 -#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 -#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT 30 -#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK 0xc0000000 - -/** - * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS - * @cap: VHT capabilities of the peer - * @bw: bandwidth to use - * @mcs: MCS index to use - * @ext_nss_bw_capable: indicates whether or not the local transmitter - * (rate scaling algorithm) can deal with the new logic - * (dot11VHTExtendedNSSBWCapable) - * @max_vht_nss: current maximum NSS as advertised by the STA in - * operating mode notification, can be 0 in which case the - * capability data will be used to derive this (from MCS support) - * Return: The maximum NSS that can be used for the given bandwidth/MCS - * combination - * - * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can - * vary for a given BW/MCS. This function parses the data. - * - * Note: This function is exported by cfg80211. 
- */ -int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, - enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable, - unsigned int max_vht_nss); - /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 #define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 @@ -3561,13 +3353,6 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; -/* VHT action codes */ -enum ieee80211_vht_actioncode { - WLAN_VHT_ACTION_COMPRESSED_BF = 0, - WLAN_VHT_ACTION_GROUPID_MGMT = 1, - WLAN_VHT_ACTION_OPMODE_NOTIF = 2, -}; - /* Self Protected Action codes */ enum ieee80211_self_protected_actioncode { WLAN_SP_RESERVED = 0, @@ -5619,6 +5404,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 #include "ieee80211-ht.h" +#include "ieee80211-vht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 02a2cf302557eb59794bba0b05d6755f44928d78 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:52 +0100 Subject: wifi: ieee80211: split HE definitions out The ieee80211.h file has gotten very long, continue splitting it by putting HE definitions into a separate file. Link: https://patch.msgid.link/20251105153843.6998c0802104.I3dd7cfea6abbd118b999ecdedd48437d39cb0533@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-he.h | 824 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 806 +----------------------------------------- 2 files changed, 827 insertions(+), 803 deletions(-) create mode 100644 include/linux/ieee80211-he.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-he.h b/include/linux/ieee80211-he.h new file mode 100644 index 000000000000..904d50db5bb8 --- /dev/null +++ b/include/linux/ieee80211-he.h @@ -0,0 +1,824 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 HE definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_HE_H +#define LINUX_IEEE80211_HE_H + +#include +#include + +#define IEEE80211_TWT_CONTROL_NDP BIT(0) +#define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) +#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) +#define IEEE80211_TWT_CONTROL_RX_DISABLED BIT(4) +#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT BIT(5) + +#define IEEE80211_TWT_REQTYPE_REQUEST BIT(0) +#define IEEE80211_TWT_REQTYPE_SETUP_CMD GENMASK(3, 1) +#define IEEE80211_TWT_REQTYPE_TRIGGER BIT(4) +#define IEEE80211_TWT_REQTYPE_IMPLICIT BIT(5) +#define IEEE80211_TWT_REQTYPE_FLOWTYPE BIT(6) +#define IEEE80211_TWT_REQTYPE_FLOWID GENMASK(9, 7) +#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP GENMASK(14, 10) +#define IEEE80211_TWT_REQTYPE_PROTECTION BIT(15) + +enum ieee80211_twt_setup_cmd { + TWT_SETUP_CMD_REQUEST, + TWT_SETUP_CMD_SUGGEST, + TWT_SETUP_CMD_DEMAND, + TWT_SETUP_CMD_GROUPING, + TWT_SETUP_CMD_ACCEPT, + TWT_SETUP_CMD_ALTERNATE, + TWT_SETUP_CMD_DICTATE, + TWT_SETUP_CMD_REJECT, +}; + +struct ieee80211_twt_params { + __le16 req_type; + __le64 twt; + u8 min_twt_dur; + __le16 mantissa; + u8 channel; +} __packed; + +struct ieee80211_twt_setup { + u8 dialog_token; + u8 element_id; + u8 length; + u8 control; + u8 params[]; +} __packed; + +/** + * struct ieee80211_he_cap_elem - HE capabilities element + * @mac_cap_info: HE MAC Capabilities Information + * @phy_cap_info: HE PHY Capabilities Information + * + * This structure represents the fixed fields of the payload of the + * "HE capabilities element" as described in IEEE Std 802.11ax-2021 + * sections 9.4.2.248.2 and 9.4.2.248.3. + */ +struct ieee80211_he_cap_elem { + u8 mac_cap_info[6]; + u8 phy_cap_info[11]; +} __packed; + +#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 + +/** + * enum ieee80211_he_mcs_support - HE MCS support definitions + * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported + * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the rx_mcs_* + * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. + */ +enum ieee80211_he_mcs_support { + IEEE80211_HE_MCS_SUPPORT_0_7 = 0, + IEEE80211_HE_MCS_SUPPORT_0_9 = 1, + IEEE80211_HE_MCS_SUPPORT_0_11 = 2, + IEEE80211_HE_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field + * + * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field + * described in P802.11ax_D2.0 section 9.4.2.237.4 + * + * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. 
+ * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + */ +struct ieee80211_he_mcs_nss_supp { + __le16 rx_mcs_80; + __le16 tx_mcs_80; + __le16 rx_mcs_160; + __le16 tx_mcs_160; + __le16 rx_mcs_80p80; + __le16 tx_mcs_80p80; +} __packed; + +/** + * struct ieee80211_he_operation - HE Operation element + * @he_oper_params: HE Operation Parameters + BSS Color Information + * @he_mcs_nss_set: Basic HE-MCS And NSS Set + * @optional: Optional fields VHT Operation Information, Max Co-Hosted + * BSSID Indicator, and 6 GHz Operation Information + * + * This structure represents the payload of the "HE Operation + * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249. + */ +struct ieee80211_he_operation { + __le32 he_oper_params; + __le16 he_mcs_nss_set; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_he_spr - Spatial Reuse Parameter Set element + * @he_sr_control: SR Control + * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD + * Min Offset, SRG OBSS PD Max Offset, SRG BSS Color + * Bitmap, and SRG Partial BSSID Bitmap + * + * This structure represents the payload of the "Spatial Reuse + * Parameter Set element" as described in IEEE Std 802.11ax-2021 + * section 9.4.2.252. + */ +struct ieee80211_he_spr { + u8 he_sr_control; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field + * @aifsn: ACI/AIFSN + * @ecw_min_max: ECWmin/ECWmax + * @mu_edca_timer: MU EDCA Timer + * + * This structure represents the "MU AC Parameter Record" as described + * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p. + */ +struct ieee80211_he_mu_edca_param_ac_rec { + u8 aifsn; + u8 ecw_min_max; + u8 mu_edca_timer; +} __packed; + +/** + * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element + * @mu_qos_info: QoS Info + * @ac_be: MU AC_BE Parameter Record + * @ac_bk: MU AC_BK Parameter Record + * @ac_vi: MU AC_VI Parameter Record + * @ac_vo: MU AC_VO Parameter Record + * + * This structure represents the payload of the "MU EDCA Parameter Set + * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251. 
+ */ +struct ieee80211_mu_edca_param_set { + u8 mu_qos_info; + struct ieee80211_he_mu_edca_param_ac_rec ac_be; + struct ieee80211_he_mu_edca_param_ac_rec ac_bk; + struct ieee80211_he_mu_edca_param_ac_rec ac_vi; + struct ieee80211_he_mu_edca_param_ac_rec ac_vo; +} __packed; + +/* 802.11ax HE MAC capabilities */ +#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 +#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 +#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 + +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1 0x00 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2 0x10 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3 0x20 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4 0x30 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5 0x40 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6 0x50 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7 0x60 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8 0x70 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK 0x70 + +/* Link adaptation is split between byte HE_MAC_CAP1 and + * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE + * in which case the following values apply: + * 0 = No feedback. + * 1 = reserved. + * 2 = Unsolicited feedback. + * 3 = both + */ +#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 + +#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 +#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 +#define IEEE80211_HE_MAC_CAP2_TRS 0x04 +#define IEEE80211_HE_MAC_CAP2_BSR 0x08 +#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 +#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 +#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 +#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 + +#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 +#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 + +/* The maximum length of an A-MDPU is defined by the combination of the Maximum + * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the + * same field in the HE capabilities. 
+ */ +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK 0x18 +#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG 0x20 +#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 +#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 + +#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 +#define IEEE80211_HE_MAC_CAP4_QTP 0x02 +#define IEEE80211_HE_MAC_CAP4_BQR 0x04 +#define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 +#define IEEE80211_HE_MAC_CAP4_OPS 0x20 +#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU 0x40 +/* Multi TID agg TX is split between byte #4 and #5 + * The value is a combination of B39,B40,B41 + */ +#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39 0x80 + +#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 0x01 +#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 0x02 +#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION 0x04 +#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU 0x08 +#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX 0x10 +#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS 0x20 +#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 +#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 + +#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 +#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 +#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR 13 + +/* 802.11ax HE PHY capabilities */ +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e + +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe + +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f +#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 +#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 +#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 +/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */ +#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS 0x80 + +#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS 0x01 +#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 +#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 +#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 + +/* Note that the meaning of UL MU below is different between an AP and a non-AP + * sta, where in the AP case it indicates support for Rx and in the non-AP sta + * case it indicates support for Tx. 
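
A small arithmetic sketch of what the *_MAX_AMPDU_FACTOR defines above imply (not the kernel's actual computation): assuming the advertised A-MPDU limit follows the same 2^(factor + extension) - 1 octet rule that the VHT enum earlier states as 2^(13 + exp) - 1, the HE length-exponent extension (0..3) is added on top of the band-specific base factor (20 for VHT bands, 16 for HT-only, 13 for the 6 GHz base). The helper below is purely illustrative:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: combine an assumed band-specific base factor with the
 * HE "Maximum A-MPDU Length Exponent Extension" and report the resulting
 * limit as 2^(factor + ext) - 1 octets. */
static uint32_t max_ampdu_len(unsigned int factor, unsigned int ext)
{
	return (UINT32_C(1) << (factor + ext)) - 1;
}

int main(void)
{
	/* e.g. VHT base factor 20 with extension 3 -> 2^23 - 1 = 8388607 octets */
	printf("%u\n", (unsigned int)max_ampdu_len(20, 3));
	return 0;
}
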
+ */ +#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 +#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 + +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 +#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU 0x40 +#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 + +#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 +#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 + +/* Minimal allowed value of Max STS under 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c + +/* Minimal allowed value of Max STS above 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 + +#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 +#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 + +#define 
IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 +#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 + +#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP 0x02 +#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 +#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 +#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 + +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 +#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 +#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF 0x20 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242 0x00 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484 0x40 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996 0x80 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996 0xc0 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK 0xc0 + +#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM 0x01 +#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK 0x02 +#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU 0x04 +#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 +#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 +#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 + +#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 + +/* 802.11ax HE TX/RX MCS NSS Support */ +#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 + +/* TX/RX HE MCS Support field Highest MCS subfield encoding */ +enum ieee80211_he_highest_mcs_supported_subfield_enc { + HIGHEST_MCS_SUPPORTED_MCS7 = 0, + HIGHEST_MCS_SUPPORTED_MCS8, + HIGHEST_MCS_SUPPORTED_MCS9, + HIGHEST_MCS_SUPPORTED_MCS10, + HIGHEST_MCS_SUPPORTED_MCS11, +}; + +/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ +static inline u8 +ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) +{ + u8 count = 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 4; + + if (he_cap->phy_cap_info[0] & + 
IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + count += 4; + + return count; +} + +/* 802.11ax HE PPE Thresholds */ +#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) +#define IEEE80211_PPE_THRES_NSS_POS (0) +#define IEEE80211_PPE_THRES_NSS_MASK (7) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ + (BIT(5) | BIT(6)) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) +#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) +#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) + +/* + * Calculate 802.11ax HE capabilities IE PPE field size + * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* + */ +static inline u8 +ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u8 n; + + if ((phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + return 0; + + n = hweight8(ppe_thres_hdr & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + return n; +} + +static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; + u8 needed = sizeof(*he_cap_ie_elem); + + if (len < needed) + return false; + + needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); + if (len < needed) + return false; + + if (he_cap_ie_elem->phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + if (len < needed + 1) + return false; + needed += ieee80211_he_ppe_size(data[needed], + he_cap_ie_elem->phy_cap_info); + } + + return len >= needed; +} + +/* HE Operation defines */ +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 +#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x00003ff0 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 4 +#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x00004000 +#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS 0x00008000 +#define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 +#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 +#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 + +#define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 +#define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 +#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 +#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP 3 +#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD 4 +#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP 8 + +/** + * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field + * @primary: primary channel + * @control: control flags + * @ccfs0: channel center frequency segment 0 + * @ccfs1: channel center frequency segment 1 + * @minrate: minimum rate (in 1 Mbps units) + */ +struct ieee80211_he_6ghz_oper { + u8 primary; +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 +#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO 0x78 + u8 control; + 
u8 ccfs0; + u8 ccfs1; + u8 minrate; +} __packed; + +/** + * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits. + * + * This enumeration defines bit flags used to represent regulatory connectivity + * field bits. + * + * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid. + * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit. + * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid. + * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit. + */ +enum ieee80211_reg_conn_bits { + IEEE80211_REG_CONN_LPI_VALID = BIT(0), + IEEE80211_REG_CONN_LPI_VALUE = BIT(1), + IEEE80211_REG_CONN_SP_VALID = BIT(2), + IEEE80211_REG_CONN_SP_VALUE = BIT(3), +}; + +/* transmit power interpretation type of transmit power envelope element */ +enum ieee80211_tx_power_intrpt_type { + IEEE80211_TPE_LOCAL_EIRP, + IEEE80211_TPE_LOCAL_EIRP_PSD, + IEEE80211_TPE_REG_CLIENT_EIRP, + IEEE80211_TPE_REG_CLIENT_EIRP_PSD, +}; + +/* category type of transmit power envelope element */ +enum ieee80211_tx_power_category_6ghz { + IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0, + IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1, +}; + +/* + * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP, + * setting to 63.5 dBm means no constraint. + */ +#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT 127 + +/* + * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD, + * setting to 127 indicates no PSD limit for the 20 MHz channel. + */ +#define IEEE80211_TPE_PSD_NO_LIMIT 127 + +/** + * struct ieee80211_tx_pwr_env - Transmit Power Envelope + * @info: Transmit Power Information field + * @variable: Maximum Transmit Power field + * + * This structure represents the payload of the "Transmit Power + * Envelope element" as described in IEEE Std 802.11ax-2021 section + * 9.4.2.161 + */ +struct ieee80211_tx_pwr_env { + u8 info; + u8 variable[]; +} __packed; + +#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 +#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 +#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 + +#define IEEE80211_TX_PWR_ENV_EXT_COUNT 0xF + +static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len) +{ + const struct ieee80211_tx_pwr_env *env = (const void *)data; + u8 count, interpret, category; + u8 needed = sizeof(*env); + u8 N; /* also called N in the spec */ + + if (len < needed) + return false; + + count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); + interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); + category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); + + switch (category) { + case IEEE80211_TPE_CAT_6GHZ_DEFAULT: + case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE: + break; + default: + return false; + } + + switch (interpret) { + case IEEE80211_TPE_LOCAL_EIRP: + case IEEE80211_TPE_REG_CLIENT_EIRP: + if (count > 3) + return false; + + /* count == 0 encodes 1 value for 20 MHz, etc. */ + needed += count + 1; + + if (len < needed) + return false; + + /* there can be extension fields not accounted for in 'count' */ + + return true; + case IEEE80211_TPE_LOCAL_EIRP_PSD: + case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: + if (count > 4) + return false; + + N = count ? 
1 << (count - 1) : 1; + needed += N; + + if (len < needed) + return false; + + if (len > needed) { + u8 K = u8_get_bits(env->variable[N], + IEEE80211_TX_PWR_ENV_EXT_COUNT); + + needed += 1 + K; + if (len < needed) + return false; + } + + return true; + } + + return false; +} + +/* + * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size + * @he_oper_ie: byte data of the He Operations IE, stating from the byte + * after the ext ID byte. It is assumed that he_oper_ie has at least + * sizeof(struct ieee80211_he_operation) bytes, the caller must have + * validated this. + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_oper_size(const u8 *he_oper_ie) +{ + const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; + u8 oper_len = sizeof(struct ieee80211_he_operation); + u32 he_oper_params; + + /* Make sure the input is not NULL */ + if (!he_oper_ie) + return 0; + + /* Calc required length */ + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + oper_len += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + oper_len++; + if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) + oper_len += sizeof(struct ieee80211_he_6ghz_oper); + + /* Add the first byte (extension ID) to the total length */ + oper_len++; + + return oper_len; +} + +/** + * ieee80211_he_6ghz_oper - obtain 6 GHz operation field + * @he_oper: HE operation element (must be pre-validated for size) + * but may be %NULL + * + * Return: a pointer to the 6 GHz operation field, or %NULL + */ +static inline const struct ieee80211_he_6ghz_oper * +ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) +{ + const u8 *ret; + u32 he_oper_params; + + if (!he_oper) + return NULL; + + ret = (const void *)&he_oper->optional; + + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + + if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) + return NULL; + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + ret += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + ret++; + + return (const void *)ret; +} + +/* HE Spatial Reuse defines */ +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) + +/* + * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size + * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte + * after the ext ID byte. 
It is assumed that he_spr_ie has at least + * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated + * this + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_spr_size(const u8 *he_spr_ie) +{ + const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; + u8 spr_len = sizeof(struct ieee80211_he_spr); + u8 he_spr_params; + + /* Make sure the input is not NULL */ + if (!he_spr_ie) + return 0; + + /* Calc required length */ + he_spr_params = he_spr->he_sr_control; + if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) + spr_len++; + if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) + spr_len += 18; + + /* Add the first byte (extension ID) to the total length */ + spr_len++; + + return spr_len; +} + +struct ieee80211_he_6ghz_capa { + /* uses IEEE80211_HE_6GHZ_CAP_* below */ + __le16 capa; +} __packed; + +/* HE 6 GHz band capabilities */ +/* uses enum ieee80211_min_mpdu_spacing values */ +#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 +/* uses enum ieee80211_vht_max_ampdu_length_exp values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 +/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 +/* WLAN_HT_CAP_SM_PS_* values */ +#define IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 +#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 +#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 +#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 + +#endif /* LINUX_IEEE80211_HE_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0b247b28c661..a3dbbcee00ee 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1141,48 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc) return len; } -#define IEEE80211_TWT_CONTROL_NDP BIT(0) -#define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) -#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) -#define IEEE80211_TWT_CONTROL_RX_DISABLED BIT(4) -#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT BIT(5) - -#define IEEE80211_TWT_REQTYPE_REQUEST BIT(0) -#define IEEE80211_TWT_REQTYPE_SETUP_CMD GENMASK(3, 1) -#define IEEE80211_TWT_REQTYPE_TRIGGER BIT(4) -#define IEEE80211_TWT_REQTYPE_IMPLICIT BIT(5) -#define IEEE80211_TWT_REQTYPE_FLOWTYPE BIT(6) -#define IEEE80211_TWT_REQTYPE_FLOWID GENMASK(9, 7) -#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP GENMASK(14, 10) -#define IEEE80211_TWT_REQTYPE_PROTECTION BIT(15) - -enum ieee80211_twt_setup_cmd { - TWT_SETUP_CMD_REQUEST, - TWT_SETUP_CMD_SUGGEST, - TWT_SETUP_CMD_DEMAND, - TWT_SETUP_CMD_GROUPING, - TWT_SETUP_CMD_ACCEPT, - TWT_SETUP_CMD_ALTERNATE, - TWT_SETUP_CMD_DICTATE, - TWT_SETUP_CMD_REJECT, -}; - -struct ieee80211_twt_params { - __le16 req_type; - __le64 twt; - u8 min_twt_dur; - __le16 mantissa; - u8 channel; -} __packed; - -struct ieee80211_twt_setup { - u8 dialog_token; - u8 element_id; - u8 length; - u8 control; - u8 params[]; -} __packed; - #define IEEE80211_TTLM_MAX_CNT 2 #define IEEE80211_TTLM_CONTROL_DIRECTION 0x03 #define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04 @@ -1633,137 +1591,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/** - * struct ieee80211_he_cap_elem - HE capabilities element - * @mac_cap_info: HE MAC Capabilities Information - * @phy_cap_info: HE PHY Capabilities Information - * - * This structure represents the fixed fields of the payload of the - * "HE capabilities element" as described in IEEE Std 802.11ax-2021 - * sections 9.4.2.248.2 
and 9.4.2.248.3. - */ -struct ieee80211_he_cap_elem { - u8 mac_cap_info[6]; - u8 phy_cap_info[11]; -} __packed; - -#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 - -/** - * enum ieee80211_he_mcs_support - HE MCS support definitions - * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the - * number of streams - * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported - * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported - * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported - * - * These definitions are used in each 2-bit subfield of the rx_mcs_* - * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are - * both split into 8 subfields by number of streams. These values indicate - * which MCSes are supported for the number of streams the value appears - * for. - */ -enum ieee80211_he_mcs_support { - IEEE80211_HE_MCS_SUPPORT_0_7 = 0, - IEEE80211_HE_MCS_SUPPORT_0_9 = 1, - IEEE80211_HE_MCS_SUPPORT_0_11 = 2, - IEEE80211_HE_MCS_NOT_SUPPORTED = 3, -}; - -/** - * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field - * - * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field - * described in P802.11ax_D2.0 section 9.4.2.237.4 - * - * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel - * widths less than 80MHz. - * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel - * widths less than 80MHz. - * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel - * width 160MHz. - * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel - * width 160MHz. - * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for - * channel width 80p80MHz. - * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for - * channel width 80p80MHz. - */ -struct ieee80211_he_mcs_nss_supp { - __le16 rx_mcs_80; - __le16 tx_mcs_80; - __le16 rx_mcs_160; - __le16 tx_mcs_160; - __le16 rx_mcs_80p80; - __le16 tx_mcs_80p80; -} __packed; - -/** - * struct ieee80211_he_operation - HE Operation element - * @he_oper_params: HE Operation Parameters + BSS Color Information - * @he_mcs_nss_set: Basic HE-MCS And NSS Set - * @optional: Optional fields VHT Operation Information, Max Co-Hosted - * BSSID Indicator, and 6 GHz Operation Information - * - * This structure represents the payload of the "HE Operation - * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249. - */ -struct ieee80211_he_operation { - __le32 he_oper_params; - __le16 he_mcs_nss_set; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_he_spr - Spatial Reuse Parameter Set element - * @he_sr_control: SR Control - * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD - * Min Offset, SRG OBSS PD Max Offset, SRG BSS Color - * Bitmap, and SRG Partial BSSID Bitmap - * - * This structure represents the payload of the "Spatial Reuse - * Parameter Set element" as described in IEEE Std 802.11ax-2021 - * section 9.4.2.252. - */ -struct ieee80211_he_spr { - u8 he_sr_control; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field - * @aifsn: ACI/AIFSN - * @ecw_min_max: ECWmin/ECWmax - * @mu_edca_timer: MU EDCA Timer - * - * This structure represents the "MU AC Parameter Record" as described - * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p. 
- */ -struct ieee80211_he_mu_edca_param_ac_rec { - u8 aifsn; - u8 ecw_min_max; - u8 mu_edca_timer; -} __packed; - -/** - * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element - * @mu_qos_info: QoS Info - * @ac_be: MU AC_BE Parameter Record - * @ac_bk: MU AC_BK Parameter Record - * @ac_vi: MU AC_VI Parameter Record - * @ac_vo: MU AC_VO Parameter Record - * - * This structure represents the payload of the "MU EDCA Parameter Set - * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251. - */ -struct ieee80211_mu_edca_param_set { - u8 mu_qos_info; - struct ieee80211_he_mu_edca_param_ac_rec ac_be; - struct ieee80211_he_mu_edca_param_ac_rec ac_bk; - struct ieee80211_he_mu_edca_param_ac_rec ac_vi; - struct ieee80211_he_mu_edca_param_ac_rec ac_vo; -} __packed; - #define IEEE80211_EHT_MCS_NSS_RX 0x0f #define IEEE80211_EHT_MCS_NSS_TX 0xf0 @@ -1902,618 +1729,6 @@ struct ieee80211_eht_operation_info { u8 optional[]; } __packed; -/* 802.11ax HE MAC capabilities */ -#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 -#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 -#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 - -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1 0x00 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2 0x10 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3 0x20 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4 0x30 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5 0x40 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6 0x50 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7 0x60 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8 0x70 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK 0x70 - -/* Link adaptation is split between byte HE_MAC_CAP1 and - * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE - * in which case the following values apply: - * 0 = No feedback. - * 1 = reserved. - * 2 = Unsolicited feedback. 
- * 3 = both - */ -#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 - -#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 -#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 -#define IEEE80211_HE_MAC_CAP2_TRS 0x04 -#define IEEE80211_HE_MAC_CAP2_BSR 0x08 -#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 -#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 -#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 -#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 - -#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 -#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 - -/* The maximum length of an A-MDPU is defined by the combination of the Maximum - * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the - * same field in the HE capabilities. - */ -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0 0x00 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1 0x08 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2 0x10 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3 0x18 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK 0x18 -#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG 0x20 -#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 -#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 - -#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 -#define IEEE80211_HE_MAC_CAP4_QTP 0x02 -#define IEEE80211_HE_MAC_CAP4_BQR 0x04 -#define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 -#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 -#define IEEE80211_HE_MAC_CAP4_OPS 0x20 -#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU 0x40 -/* Multi TID agg TX is split between byte #4 and #5 - * The value is a combination of B39,B40,B41 - */ -#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39 0x80 - -#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 0x01 -#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 0x02 -#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION 0x04 -#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU 0x08 -#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX 0x10 -#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS 0x20 -#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 -#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 - -#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 -#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 -#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR 13 - -/* 802.11ax HE PHY capabilities */ -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e - -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe - -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f -#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 -#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 -#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 -/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */ -#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS 
0x80 - -#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS 0x01 -#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 -#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 -#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 -#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 -#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 - -/* Note that the meaning of UL MU below is different between an AP and a non-AP - * sta, where in the AP case it indicates support for Rx and in the non-AP sta - * case it indicates support for Tx. - */ -#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 -#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 - -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 -#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU 0x40 -#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 - -#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 -#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 - -/* Minimal allowed value of Max STS under 80MHz is 3 */ -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c - -/* Minimal allowed value of Max STS above 80MHz is 3 */ -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 - -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 - -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 -#define 
IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 - -#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 -#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 - -#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 -#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 -#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB 0x04 -#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB 0x08 -#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 -#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 -#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 -#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 - -#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR 0x01 -#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP 0x02 -#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 -#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 -#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 - -#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 -#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 -#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 -#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 -#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 -#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF 0x20 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242 0x00 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484 0x40 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996 0x80 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996 0xc0 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK 0xc0 - -#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM 0x01 -#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK 0x02 -#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU 0x04 -#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 -#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 -#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 - -#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 - -/* 802.11ax HE TX/RX MCS NSS Support */ -#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 -#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 - -/* TX/RX HE MCS Support field Highest MCS subfield encoding */ -enum 
ieee80211_he_highest_mcs_supported_subfield_enc { - HIGHEST_MCS_SUPPORTED_MCS7 = 0, - HIGHEST_MCS_SUPPORTED_MCS8, - HIGHEST_MCS_SUPPORTED_MCS9, - HIGHEST_MCS_SUPPORTED_MCS10, - HIGHEST_MCS_SUPPORTED_MCS11, -}; - -/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ -static inline u8 -ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) -{ - u8 count = 4; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) - count += 4; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) - count += 4; - - return count; -} - -/* 802.11ax HE PPE Thresholds */ -#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) -#define IEEE80211_PPE_THRES_NSS_POS (0) -#define IEEE80211_PPE_THRES_NSS_MASK (7) -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ - (BIT(5) | BIT(6)) -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) -#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) -#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) - -/* - * Calculate 802.11ax HE capabilities IE PPE field size - * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* - */ -static inline u8 -ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) -{ - u8 n; - - if ((phy_cap_info[6] & - IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) - return 0; - - n = hweight8(ppe_thres_hdr & - IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); - n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> - IEEE80211_PPE_THRES_NSS_POS)); - - /* - * Each pair is 6 bits, and we need to add the 7 "header" bits to the - * total size. - */ - n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; - n = DIV_ROUND_UP(n, 8); - - return n; -} - -static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; - u8 needed = sizeof(*he_cap_ie_elem); - - if (len < needed) - return false; - - needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); - if (len < needed) - return false; - - if (he_cap_ie_elem->phy_cap_info[6] & - IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { - if (len < needed + 1) - return false; - needed += ieee80211_he_ppe_size(data[needed], - he_cap_ie_elem->phy_cap_info); - } - - return len >= needed; -} - -/* HE Operation defines */ -#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 -#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 -#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x00003ff0 -#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 4 -#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x00004000 -#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS 0x00008000 -#define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 -#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 -#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 - -#define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 -#define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 -#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 -#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP 3 -#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD 4 -#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP 8 - -/** - * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field - * @primary: primary channel - * @control: control flags - * @ccfs0: channel center frequency 
segment 0 - * @ccfs1: channel center frequency segment 1 - * @minrate: minimum rate (in 1 Mbps units) - */ -struct ieee80211_he_6ghz_oper { - u8 primary; -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 -#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 -#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO 0x78 - u8 control; - u8 ccfs0; - u8 ccfs1; - u8 minrate; -} __packed; - -/** - * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits. - * - * This enumeration defines bit flags used to represent regulatory connectivity - * field bits. - * - * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid. - * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit. - * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid. - * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit. - */ -enum ieee80211_reg_conn_bits { - IEEE80211_REG_CONN_LPI_VALID = BIT(0), - IEEE80211_REG_CONN_LPI_VALUE = BIT(1), - IEEE80211_REG_CONN_SP_VALID = BIT(2), - IEEE80211_REG_CONN_SP_VALUE = BIT(3), -}; - -/* transmit power interpretation type of transmit power envelope element */ -enum ieee80211_tx_power_intrpt_type { - IEEE80211_TPE_LOCAL_EIRP, - IEEE80211_TPE_LOCAL_EIRP_PSD, - IEEE80211_TPE_REG_CLIENT_EIRP, - IEEE80211_TPE_REG_CLIENT_EIRP_PSD, -}; - -/* category type of transmit power envelope element */ -enum ieee80211_tx_power_category_6ghz { - IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0, - IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1, -}; - -/* - * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP, - * setting to 63.5 dBm means no constraint. - */ -#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT 127 - -/* - * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD, - * setting to 127 indicates no PSD limit for the 20 MHz channel. - */ -#define IEEE80211_TPE_PSD_NO_LIMIT 127 - -/** - * struct ieee80211_tx_pwr_env - Transmit Power Envelope - * @info: Transmit Power Information field - * @variable: Maximum Transmit Power field - * - * This structure represents the payload of the "Transmit Power - * Envelope element" as described in IEEE Std 802.11ax-2021 section - * 9.4.2.161 - */ -struct ieee80211_tx_pwr_env { - u8 info; - u8 variable[]; -} __packed; - -#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 -#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 -#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 - -#define IEEE80211_TX_PWR_ENV_EXT_COUNT 0xF - -static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len) -{ - const struct ieee80211_tx_pwr_env *env = (const void *)data; - u8 count, interpret, category; - u8 needed = sizeof(*env); - u8 N; /* also called N in the spec */ - - if (len < needed) - return false; - - count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); - interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); - category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); - - switch (category) { - case IEEE80211_TPE_CAT_6GHZ_DEFAULT: - case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE: - break; - default: - return false; - } - - switch (interpret) { - case IEEE80211_TPE_LOCAL_EIRP: - case IEEE80211_TPE_REG_CLIENT_EIRP: - if (count > 3) - return false; - - /* count == 0 encodes 1 value for 20 MHz, etc. 
*/ - needed += count + 1; - - if (len < needed) - return false; - - /* there can be extension fields not accounted for in 'count' */ - - return true; - case IEEE80211_TPE_LOCAL_EIRP_PSD: - case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: - if (count > 4) - return false; - - N = count ? 1 << (count - 1) : 1; - needed += N; - - if (len < needed) - return false; - - if (len > needed) { - u8 K = u8_get_bits(env->variable[N], - IEEE80211_TX_PWR_ENV_EXT_COUNT); - - needed += 1 + K; - if (len < needed) - return false; - } - - return true; - } - - return false; -} - -/* - * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size - * @he_oper_ie: byte data of the He Operations IE, stating from the byte - * after the ext ID byte. It is assumed that he_oper_ie has at least - * sizeof(struct ieee80211_he_operation) bytes, the caller must have - * validated this. - * @return the actual size of the IE data (not including header), or 0 on error - */ -static inline u8 -ieee80211_he_oper_size(const u8 *he_oper_ie) -{ - const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; - u8 oper_len = sizeof(struct ieee80211_he_operation); - u32 he_oper_params; - - /* Make sure the input is not NULL */ - if (!he_oper_ie) - return 0; - - /* Calc required length */ - he_oper_params = le32_to_cpu(he_oper->he_oper_params); - if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) - oper_len += 3; - if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) - oper_len++; - if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) - oper_len += sizeof(struct ieee80211_he_6ghz_oper); - - /* Add the first byte (extension ID) to the total length */ - oper_len++; - - return oper_len; -} - -/** - * ieee80211_he_6ghz_oper - obtain 6 GHz operation field - * @he_oper: HE operation element (must be pre-validated for size) - * but may be %NULL - * - * Return: a pointer to the 6 GHz operation field, or %NULL - */ -static inline const struct ieee80211_he_6ghz_oper * -ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) -{ - const u8 *ret; - u32 he_oper_params; - - if (!he_oper) - return NULL; - - ret = (const void *)&he_oper->optional; - - he_oper_params = le32_to_cpu(he_oper->he_oper_params); - - if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) - return NULL; - if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) - ret += 3; - if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) - ret++; - - return (const void *)ret; -} - -/* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) -#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) -#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) - -/* - * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size - * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte - * after the ext ID byte. 
It is assumed that he_spr_ie has at least - * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated - * this - * @return the actual size of the IE data (not including header), or 0 on error - */ -static inline u8 -ieee80211_he_spr_size(const u8 *he_spr_ie) -{ - const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; - u8 spr_len = sizeof(struct ieee80211_he_spr); - u8 he_spr_params; - - /* Make sure the input is not NULL */ - if (!he_spr_ie) - return 0; - - /* Calc required length */ - he_spr_params = he_spr->he_sr_control; - if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) - spr_len++; - if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) - spr_len += 18; - - /* Add the first byte (extension ID) to the total length */ - spr_len++; - - return spr_len; -} - /* S1G Capabilities Information field */ #define IEEE80211_S1G_CAPABILITY_LEN 15 @@ -2697,6 +1912,9 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 #define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 +/* need HE definitions for EHT functions */ +#include "ieee80211-he.h" + /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, @@ -3815,24 +3033,6 @@ struct ieee80211_tspec_ie { __le16 medium_time; } __packed; -struct ieee80211_he_6ghz_capa { - /* uses IEEE80211_HE_6GHZ_CAP_* below */ - __le16 capa; -} __packed; - -/* HE 6 GHz band capabilities */ -/* uses enum ieee80211_min_mpdu_spacing values */ -#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 -/* uses enum ieee80211_vht_max_ampdu_length_exp values */ -#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 -/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ -#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 -/* WLAN_HT_CAP_SM_PS_* values */ -#define IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 -#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 -#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 -#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 - /** * ieee80211_get_qos_ctl - get pointer to qos control bytes * @hdr: the frame -- cgit v1.2.3 From 86bc0c662322b4749cd666678d2fdce7015bcae3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:53 +0100 Subject: wifi: ieee80211: split EHT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting EHT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.bf77fe169140.I691267e0edd914c604a5bfd447d33be00044c9b4@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-eht.h | 1182 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 1164 +--------------------------------------- 2 files changed, 1184 insertions(+), 1162 deletions(-) create mode 100644 include/linux/ieee80211-eht.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h new file mode 100644 index 000000000000..f9782e46c5e5 --- /dev/null +++ b/include/linux/ieee80211-eht.h @@ -0,0 +1,1182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 EHT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu
+ * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (c) 2018 - 2025 Intel Corporation
+ */
+
+#ifndef LINUX_IEEE80211_EHT_H
+#define LINUX_IEEE80211_EHT_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+/* need HE definitions for the inlines here */
+#include <linux/ieee80211-he.h>
+
+#define IEEE80211_TTLM_MAX_CNT 2
+#define IEEE80211_TTLM_CONTROL_DIRECTION 0x03
+#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04
+#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT 0x08
+#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT 0x10
+#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE 0x20
+
+#define IEEE80211_TTLM_DIRECTION_DOWN 0
+#define IEEE80211_TTLM_DIRECTION_UP 1
+#define IEEE80211_TTLM_DIRECTION_BOTH 2
+
+/**
+ * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
+ *
+ * Defined in section 9.4.2.314 in P802.11be_D4
+ *
+ * @control: the first part of control field
+ * @optional: the second part of control field
+ */
+struct ieee80211_ttlm_elem {
+	u8 control;
+	u8 optional[];
+} __packed;
+
+#define IEEE80211_EHT_MCS_NSS_RX 0x0f
+#define IEEE80211_EHT_MCS_NSS_TX 0xf0
+
+/**
+ * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max
+ * supported NSS for per MCS.
+ *
+ * For each field below, bits 0 - 3 indicate the maximal number of spatial
+ * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
+ * for Tx.
+ *
+ * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 0 - 7.
+ * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 8 - 9.
+ * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 10 - 11.
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 12 - 13.
+ * @rx_tx_max_nss: array of the previous fields for easier loop access
+ */
+struct ieee80211_eht_mcs_nss_supp_20mhz_only {
+	union {
+		struct {
+			u8 rx_tx_mcs7_max_nss;
+			u8 rx_tx_mcs9_max_nss;
+			u8 rx_tx_mcs11_max_nss;
+			u8 rx_tx_mcs13_max_nss;
+		};
+		u8 rx_tx_max_nss[4];
+	};
+};
+
+/**
+ * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except
+ * 20MHz only stations).
+ *
+ * For each field below, bits 0 - 3 indicate the maximal number of spatial
+ * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
+ * for Tx.
+ *
+ * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 0 - 9.
+ * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 10 - 11.
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
+ * supported for reception and the maximum number of spatial streams
+ * supported for transmission for MCS 12 - 13.
+ * @rx_tx_max_nss: array of the previous fields for easier loop access + */ +struct ieee80211_eht_mcs_nss_supp_bw { + union { + struct { + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; + }; + u8 rx_tx_max_nss[3]; + }; +}; + +/** + * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data + * + * This structure is the "EHT Capabilities element" fixed fields as + * described in P802.11be_D2.0 section 9.4.2.313. + * + * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* + * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* + */ +struct ieee80211_eht_cap_elem_fixed { + u8 mac_cap_info[2]; + u8 phy_cap_info[9]; +} __packed; + +/** + * struct ieee80211_eht_cap_elem - EHT capabilities element + * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed + * @optional: optional parts + */ +struct ieee80211_eht_cap_elem { + struct ieee80211_eht_cap_elem_fixed fixed; + + /* + * Followed by: + * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. + * EHT PPE Thresholds field: variable length. + */ + u8 optional[]; +} __packed; + +#define IEEE80211_EHT_OPER_INFO_PRESENT 0x01 +#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x02 +#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 +#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 + +/** + * struct ieee80211_eht_operation - eht operation element + * + * This structure is the "EHT Operation Element" fields as + * described in P802.11be_D2.0 section 9.4.2.311 + * + * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_* + * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in + * EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and + * receive. + * @optional: optional parts + */ +struct ieee80211_eht_operation { + u8 params; + struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_eht_operation_info - eht operation information + * + * @control: EHT operation information control. + * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz + * EHT BSS. + * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS. 
+ * @optional: optional parts + */ +struct ieee80211_eht_operation_info { + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 optional[]; +} __packed; + +/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ +#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 +#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 +#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 +#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK 0xc0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895 0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991 1 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 + +#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 +#define IEEE80211_EHT_MAC_CAP1_EHT_TRS 0x02 +#define IEEE80211_EHT_MAC_CAP1_TXOP_RET 0x04 +#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS 0x08 +#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK 0x30 +#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS 0x40 + +/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */ +#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 +#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 +#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 + +/* EHT beamformee number of spatial streams <= 80MHz is split */ +#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 + +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 + +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 + +/* EHT number of sounding dimensions for 320MHz is split */ +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 +#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 +#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 +#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 +#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 +#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 + +#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 +#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 +#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 + +#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 +#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 + +/* Maximum number of supported EHT LTF is split */ +#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 +#define 
IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 + +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 +#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 + +#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 + +#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 +#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 + +/* + * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311 + */ +#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 + +/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ +static inline u8 +ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) +{ + u8 count = 0; + + /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + return 3; + + /* on 2.4 GHz, these three bits are reserved, so should be 0 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) + count += 3; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 3; + + if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + count += 3; + + if (count) + return count; + + return from_ap ? 3 : 4; +} + +/* 802.11be EHT PPE Thresholds */ +#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 +#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf +#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 +#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 +#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 + +/* + * Calculate 802.11be EHT capabilities IE EHT field size + */ +static inline u8 +ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u32 n; + + if (!(phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) + return 0; + + n = hweight16(ppe_thres_hdr & + IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); + + /* + * Each pair is 6 bits, and we need to add the 9 "header" bits to the + * total size. 
+ */ + n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + + IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; + return DIV_ROUND_UP(n, 8); +} + +static inline bool +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) +{ + const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; + u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); + + if (len < needed || !he_capa) + return false; + + needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, + (const void *)data, + from_ap); + if (len < needed) + return false; + + if (elem->phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { + u16 ppe_thres_hdr; + + if (len < needed + sizeof(ppe_thres_hdr)) + return false; + + ppe_thres_hdr = get_unaligned_le16(data + needed); + needed += ieee80211_eht_ppe_size(ppe_thres_hdr, + elem->phy_cap_info); + } + + return len >= needed; +} + +static inline bool +ieee80211_eht_oper_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_eht_operation *elem = (const void *)data; + u8 needed = sizeof(*elem); + + if (len < needed) + return false; + + if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) { + needed += 3; + + if (elem->params & + IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) + needed += 2; + } + + return len >= needed; +} + +/* must validate ieee80211_eht_oper_size_ok() first */ +static inline u16 +ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper) +{ + const struct ieee80211_eht_operation_info *info = + (const void *)eht_oper->optional; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) + return 0; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) + return 0; + + return get_unaligned_le16(info->optional); +} + +#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT BIT(1) + +struct ieee80211_bandwidth_indication { + u8 params; + struct ieee80211_eht_operation_info info; +} __packed; + +static inline bool +ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_bandwidth_indication *bwi = (const void *)data; + + if (len < sizeof(*bwi)) + return false; + + if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT && + len < sizeof(*bwi) + 2) + return false; + + return true; +} + +/* Protected EHT action codes */ +enum ieee80211_protected_eht_actioncode { + WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, + WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, + WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, + WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, +}; + +/* multi-link device */ +#define IEEE80211_MLD_MAX_NUM_LINKS 15 + +#define IEEE80211_ML_CONTROL_TYPE 0x0007 +#define IEEE80211_ML_CONTROL_TYPE_BASIC 0 +#define IEEE80211_ML_CONTROL_TYPE_PREQ 1 +#define IEEE80211_ML_CONTROL_TYPE_RECONF 2 +#define IEEE80211_ML_CONTROL_TYPE_TDLS 3 +#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS 4 +#define IEEE80211_ML_CONTROL_PRESENCE_MASK 0xfff0 + +struct ieee80211_multi_link_elem { + __le16 control; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_BASIC_PRES_LINK_ID 0x0010 +#define 
IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT 0x0020 +#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY 0x0040 +#define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 +#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 +#define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 +#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 + +#define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff +#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 +#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS 0xf000 + +/* + * Described in P802.11be_D3.0 + * dot11MSDTimerDuration should default to 5484 (i.e. 171.375) + * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0) + * dot11MSDTXOPMAX defaults to 1 + */ +#define IEEE80211_MED_SYNC_DELAY_DEFAULT 0x10ac + +#define IEEE80211_EML_CAP_EMLSR_SUPP 0x0001 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY 0x000e +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY 0x0070 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US 1 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US 2 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US 3 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US 5 +#define IEEE80211_EML_CAP_EMLMR_SUPPORT 0x0080 +#define IEEE80211_EML_CAP_EMLMR_DELAY 0x0700 +#define IEEE80211_EML_CAP_EMLMR_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLMR_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLMR_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLMR_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLMR_DELAY_256US 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT 0x7800 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0 0 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US 1 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US 2 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US 3 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU 5 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU 6 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU 7 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU 8 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU 9 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU 10 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU 11 + +#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f +#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP 0 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME 1 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED 2 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 +#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 +#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 +#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 +#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 + +struct ieee80211_mle_basic_common_info { + u8 len; + u8 mld_mac_addr[ETH_ALEN]; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_PREQ_PRES_MLD_ID 0x0010 + +struct ieee80211_mle_preq_common_info { + u8 len; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 +#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 +#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 
+#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 + +/* no fixed fields in RECONF */ + +struct ieee80211_mle_tdls_common_info { + u8 len; + u8 ap_mld_mac_addr[ETH_ALEN]; +} __packed; + +#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR 0x0010 + +/* no fixed fields in PRIO_ACCESS */ + +/** + * ieee80211_mle_common_size - check multi-link element common size + * @data: multi-link element, must already be checked for size using + * ieee80211_mle_size_ok() + * Return: the size of the multi-link element's "common" subfield + */ +static inline u8 ieee80211_mle_common_size(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + case IEEE80211_ML_CONTROL_TYPE_PREQ: + case IEEE80211_ML_CONTROL_TYPE_TDLS: + case IEEE80211_ML_CONTROL_TYPE_RECONF: + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + /* + * The length is the first octet pointed by mle->variable so no + * need to add anything + */ + break; + default: + WARN_ON(1); + return 0; + } + + return sizeof(*mle) + mle->variable[0]; +} + +/** + * ieee80211_mle_get_link_id - returns the link ID + * @data: the basic multi link element + * Return: the link ID, or -1 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline int ieee80211_mle_get_link_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID)) + return -1; + + return *common; +} + +/** + * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count + * @data: pointer to the basic multi link element + * Return: the BSS Parameter Change Count field value, or -1 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline int +ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)) + return -1; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + + return *common; +} + +/** + * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay + * @data: pointer to the multi-link element + * Return: the medium synchronization delay field value from the multi-link + * element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT) + * if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) + return IEEE80211_MED_SYNC_DELAY_DEFAULT; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_eml_cap - returns the EML capability + * @data: pointer to the multi-link element + * Return: the EML capability field value from the multi-link element, + * or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. + * @data: pointer to the multi-link element + * Return: the MLD capabilities and operations field value from the multi-link + * element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + + return get_unaligned_le16(common); +} + +/* Defined in Figure 9-1074t in P802.11be_D7.0 */ +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE 0x0001 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK 0x001e +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE 0x0020 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK 0x0040 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP 0x0080 + +/** + * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities + * and operations. 
+ * @data: pointer to the multi-link element + * Return: the extended MLD capabilities and operations field value from + * the multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_mld_id - returns the MLD ID + * @data: pointer to the multi-link element + * Return: The MLD ID in the given multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u8 ieee80211_mle_get_mld_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + + return *common; +} + +/** + * ieee80211_mle_size_ok - validate multi-link element size + * @data: pointer to the element data + * @len: length of the containing element + * Return: whether or not the multi-link element size is OK + */ +static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u8 fixed = sizeof(*mle); + u8 common = 0; + bool check_common_len = false; + u16 control; + + if (!data || len < fixed) + return false; + + control = le16_to_cpu(mle->control); + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + common += sizeof(struct ieee80211_mle_basic_common_info); + check_common_len = true; + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) + common += 2; + break; + case 
IEEE80211_ML_CONTROL_TYPE_PREQ: + common += sizeof(struct ieee80211_mle_preq_common_info); + if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) + common += 1; + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_RECONF: + if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) + common += ETH_ALEN; + if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) + common += 2; + break; + case IEEE80211_ML_CONTROL_TYPE_TDLS: + common += sizeof(struct ieee80211_mle_tdls_common_info); + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + common = ETH_ALEN + 1; + break; + default: + /* we don't know this type */ + return true; + } + + if (len < fixed + common) + return false; + + if (!check_common_len) + return true; + + /* if present, common length is the first octet there */ + return mle->variable[0] >= common; +} + +/** + * ieee80211_mle_type_ok - validate multi-link element type and size + * @data: pointer to the element data + * @type: expected type of the element + * @len: length of the containing element + * Return: whether or not the multi-link element type matches and size is OK + */ +static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control; + + if (!ieee80211_mle_size_ok(data, len)) + return false; + + control = le16_to_cpu(mle->control); + + if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type) + return true; + + return false; +} + +enum ieee80211_mle_subelems { + IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, +}; + +#define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f +#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE 0x0010 +#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 +#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT 0x0040 +#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT 0x0080 +#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT 0x0100 +#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT 0x0200 +#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE 0x0400 +#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT 0x0800 + +struct ieee80211_mle_per_sta_profile { + __le16 control; + u8 sta_info_len; + u8 variable[]; +} __packed; + +/** + * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta + * profile size + * @data: pointer to the sub element data + * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise + */ +static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data, + size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + info_len += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + info_len += 2; + else + info_len += 1; + } + if 
(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) + info_len += 1; + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len - 1 <= len; +} + +/** + * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS + * parameter change count + * @prof: the per-STA profile, having been checked with + * ieee80211_mle_basic_sta_prof_size_ok() for the correct length + * + * Return: The BSS parameter change count value if present, 0 otherwise. + */ +static inline u8 +ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof) +{ + u16 control = le16_to_cpu(prof->control); + const u8 *pos = prof->variable; + + if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)) + return 0; + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + pos += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + pos += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + pos += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + pos += 2; + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + pos += 2; + else + pos += 1; + } + + return *pos; +} + +#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID 0x000f +#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 +#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 +#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 + +/** + * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link + * element sta profile size. 
+ * @data: pointer to the sub element data + * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise + */ +static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, + size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += ETH_ALEN; + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT) + info_len += 2; + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len - 1 <= len; +} + +#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f +#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 + +static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_ttlm_elem *t2l = (const void *)data; + u8 control, fixed = sizeof(*t2l), elem_len = 0; + + if (len < fixed) + return false; + + control = t2l->control; + + if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) + elem_len += 2; + if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) + elem_len += 3; + + if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) { + u8 bm_size; + + elem_len += 1; + if (len < fixed + elem_len) + return false; + + if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) + bm_size = 1; + else + bm_size = 2; + + elem_len += hweight8(t2l->optional[0]) * bm_size; + } + + return len >= fixed + elem_len; +} + +/** + * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay + * in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Padding delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR + * Padding Delay subfield. + */ + u32 pad_delay = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + + if (!pad_delay || + pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return 0; + + return 32 * (1 << (pad_delay - 1)); +} + +/** + * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition + * delay in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR + * Transition Delay subfield. 
+ */ + u32 trans_delay = + u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + + /* invalid values also just use 0 */ + if (!trans_delay || + trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return 0; + + return 16 * (1 << (trans_delay - 1)); +} + +/** + * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition + * timeout value in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition timeout (in microseconds) encoded in + * the EML Capabilities field + */ + +static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the + * Transition Timeout subfield. + */ + u8 timeout = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_TRANSITION_TIMEOUT); + + /* invalid values also just use 0 */ + if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) + return 0; + + return 128 * (1 << (timeout - 1)); +} + +#define for_each_mle_subelement(_elem, _data, _len) \ + if (ieee80211_mle_size_ok(_data, _len)) \ + for_each_element(_elem, \ + _data + ieee80211_mle_common_size(_data),\ + _len - ieee80211_mle_common_size(_data)) + +#endif /* LINUX_IEEE80211_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a3dbbcee00ee..63a9775b059d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1141,30 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc) return len; } -#define IEEE80211_TTLM_MAX_CNT 2 -#define IEEE80211_TTLM_CONTROL_DIRECTION 0x03 -#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04 -#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT 0x08 -#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT 0x10 -#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE 0x20 - -#define IEEE80211_TTLM_DIRECTION_DOWN 0 -#define IEEE80211_TTLM_DIRECTION_UP 1 -#define IEEE80211_TTLM_DIRECTION_BOTH 2 - -/** - * struct ieee80211_ttlm_elem - TID-To-Link Mapping element - * - * Defined in section 9.4.2.314 in P802.11be_D4 - * - * @control: the first part of control field - * @optional: the second part of control field - */ -struct ieee80211_ttlm_elem { - u8 control; - u8 optional[]; -} __packed; - /** * struct ieee80211_bss_load_elem - BSS Load elemen * @@ -1591,144 +1567,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -#define IEEE80211_EHT_MCS_NSS_RX 0x0f -#define IEEE80211_EHT_MCS_NSS_TX 0xf0 - -/** - * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max - * supported NSS for per MCS. - * - * For each field below, bits 0 - 3 indicate the maximal number of spatial - * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams - * for Tx. - * - * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 0 - 7. - * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 8 - 9. - * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 10 - 11. 
- * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 12 - 13. - * @rx_tx_max_nss: array of the previous fields for easier loop access - */ -struct ieee80211_eht_mcs_nss_supp_20mhz_only { - union { - struct { - u8 rx_tx_mcs7_max_nss; - u8 rx_tx_mcs9_max_nss; - u8 rx_tx_mcs11_max_nss; - u8 rx_tx_mcs13_max_nss; - }; - u8 rx_tx_max_nss[4]; - }; -}; - -/** - * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except - * 20MHz only stations). - * - * For each field below, bits 0 - 3 indicate the maximal number of spatial - * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams - * for Tx. - * - * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 0 - 9. - * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 10 - 11. - * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 12 - 13. - * @rx_tx_max_nss: array of the previous fields for easier loop access - */ -struct ieee80211_eht_mcs_nss_supp_bw { - union { - struct { - u8 rx_tx_mcs9_max_nss; - u8 rx_tx_mcs11_max_nss; - u8 rx_tx_mcs13_max_nss; - }; - u8 rx_tx_max_nss[3]; - }; -}; - -/** - * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data - * - * This structure is the "EHT Capabilities element" fixed fields as - * described in P802.11be_D2.0 section 9.4.2.313. - * - * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* - * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* - */ -struct ieee80211_eht_cap_elem_fixed { - u8 mac_cap_info[2]; - u8 phy_cap_info[9]; -} __packed; - -/** - * struct ieee80211_eht_cap_elem - EHT capabilities element - * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed - * @optional: optional parts - */ -struct ieee80211_eht_cap_elem { - struct ieee80211_eht_cap_elem_fixed fixed; - - /* - * Followed by: - * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. - * EHT PPE Thresholds field: variable length. - */ - u8 optional[]; -} __packed; - -#define IEEE80211_EHT_OPER_INFO_PRESENT 0x01 -#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x02 -#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 -#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 -#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 -#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 - -/** - * struct ieee80211_eht_operation - eht operation element - * - * This structure is the "EHT Operation Element" fields as - * described in P802.11be_D2.0 section 9.4.2.311 - * - * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_* - * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in - * EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and - * receive. - * @optional: optional parts - */ -struct ieee80211_eht_operation { - u8 params; - struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_eht_operation_info - eht operation information - * - * @control: EHT operation information control. 
- * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz - * EHT BSS. - * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS. - * @optional: optional parts - */ -struct ieee80211_eht_operation_info { - u8 control; - u8 ccfs0; - u8 ccfs1; - u8 optional[]; -} __packed; - /* S1G Capabilities Information field */ #define IEEE80211_S1G_CAPABILITY_LEN 15 @@ -1815,258 +1653,6 @@ struct ieee80211_eht_operation_info { #define S1G_2M_PRIMARY_LOCATION_LOWER 0 #define S1G_2M_PRIMARY_LOCATION_UPPER 1 -/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ -#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 -#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 -#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 -#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 -#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 -#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK 0xc0 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895 0 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991 1 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 - -#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 -#define IEEE80211_EHT_MAC_CAP1_EHT_TRS 0x02 -#define IEEE80211_EHT_MAC_CAP1_TXOP_RET 0x04 -#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS 0x08 -#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK 0x30 -#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS 0x40 - -/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */ -#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 -#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 -#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 -#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 -#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 -#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 - -/* EHT beamformee number of spatial streams <= 80MHz is split */ -#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 - -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 - -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 - -/* EHT number of sounding dimensions for 320MHz is split */ -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 -#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 -#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 -#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 -#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 -#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 -#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 -#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 -#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 - -#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 -#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 -#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 -#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 -#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 - -#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 -#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 -#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 -#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 
-#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 - -/* Maximum number of supported EHT LTF is split */ -#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 -#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 -#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 - -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 -#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 - -#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 -#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 - -#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 -#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 - -/* - * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311 - */ -#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 - -/* need HE definitions for EHT functions */ -#include "ieee80211-he.h" - -/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ -static inline u8 -ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - const struct ieee80211_eht_cap_elem_fixed *eht_cap, - bool from_ap) -{ - u8 count = 0; - - /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) - return 3; - - /* on 2.4 GHz, these three bits are reserved, so should be 0 */ - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) - count += 3; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) - count += 3; - - if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) - count += 3; - - if (count) - return count; - - return from_ap ? 3 : 4; -} - -/* 802.11be EHT PPE Thresholds */ -#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 -#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf -#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 -#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 -#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 - -/* - * Calculate 802.11be EHT capabilities IE EHT field size - */ -static inline u8 -ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) -{ - u32 n; - - if (!(phy_cap_info[5] & - IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) - return 0; - - n = hweight16(ppe_thres_hdr & - IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); - n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); - - /* - * Each pair is 6 bits, and we need to add the 9 "header" bits to the - * total size. 
- */ - n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + - IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; - return DIV_ROUND_UP(n, 8); -} - -static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, - bool from_ap) -{ - const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; - u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); - - if (len < needed || !he_capa) - return false; - - needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data, - from_ap); - if (len < needed) - return false; - - if (elem->phy_cap_info[5] & - IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { - u16 ppe_thres_hdr; - - if (len < needed + sizeof(ppe_thres_hdr)) - return false; - - ppe_thres_hdr = get_unaligned_le16(data + needed); - needed += ieee80211_eht_ppe_size(ppe_thres_hdr, - elem->phy_cap_info); - } - - return len >= needed; -} - -static inline bool -ieee80211_eht_oper_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_eht_operation *elem = (const void *)data; - u8 needed = sizeof(*elem); - - if (len < needed) - return false; - - if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) { - needed += 3; - - if (elem->params & - IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) - needed += 2; - } - - return len >= needed; -} - -/* must validate ieee80211_eht_oper_size_ok() first */ -static inline u16 -ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper) -{ - const struct ieee80211_eht_operation_info *info = - (const void *)eht_oper->optional; - - if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) - return 0; - - if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) - return 0; - - return get_unaligned_le16(info->optional); -} - -#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT BIT(1) - -struct ieee80211_bandwidth_indication { - u8 params; - struct ieee80211_eht_operation_info info; -} __packed; - -static inline bool -ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_bandwidth_indication *bwi = (const void *)data; - - if (len < sizeof(*bwi)) - return false; - - if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT && - len < sizeof(*bwi) + 2) - return false; - - return true; -} - #define LISTEN_INT_USF GENMASK(15, 14) #define LISTEN_INT_UI GENMASK(13, 0) @@ -2587,23 +2173,6 @@ enum ieee80211_unprotected_wnm_actioncode { WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1, }; -/* Protected EHT action codes */ -enum ieee80211_protected_eht_actioncode { - WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, - WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, - WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, - WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, - WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, - WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, - WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, -}; - /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, @@ -3855,737 +3424,6 @@ struct ieee80211_tbtt_info_ge_11 { struct ieee80211_rnr_mld_params mld_params; } __packed; -/* multi-link device */ -#define IEEE80211_MLD_MAX_NUM_LINKS 15 - -#define IEEE80211_ML_CONTROL_TYPE 0x0007 -#define IEEE80211_ML_CONTROL_TYPE_BASIC 0 
-#define IEEE80211_ML_CONTROL_TYPE_PREQ 1 -#define IEEE80211_ML_CONTROL_TYPE_RECONF 2 -#define IEEE80211_ML_CONTROL_TYPE_TDLS 3 -#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS 4 -#define IEEE80211_ML_CONTROL_PRESENCE_MASK 0xfff0 - -struct ieee80211_multi_link_elem { - __le16 control; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_BASIC_PRES_LINK_ID 0x0010 -#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT 0x0020 -#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY 0x0040 -#define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 -#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 -#define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 -#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 - -#define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff -#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 -#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS 0xf000 - -/* - * Described in P802.11be_D3.0 - * dot11MSDTimerDuration should default to 5484 (i.e. 171.375) - * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0) - * dot11MSDTXOPMAX defaults to 1 - */ -#define IEEE80211_MED_SYNC_DELAY_DEFAULT 0x10ac - -#define IEEE80211_EML_CAP_EMLSR_SUPP 0x0001 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY 0x000e -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US 1 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US 2 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US 3 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US 4 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY 0x0070 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US 1 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US 2 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US 3 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US 4 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US 5 -#define IEEE80211_EML_CAP_EMLMR_SUPPORT 0x0080 -#define IEEE80211_EML_CAP_EMLMR_DELAY 0x0700 -#define IEEE80211_EML_CAP_EMLMR_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLMR_DELAY_32US 1 -#define IEEE80211_EML_CAP_EMLMR_DELAY_64US 2 -#define IEEE80211_EML_CAP_EMLMR_DELAY_128US 3 -#define IEEE80211_EML_CAP_EMLMR_DELAY_256US 4 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT 0x7800 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0 0 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US 1 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US 2 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US 3 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU 4 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU 5 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU 6 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU 7 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU 8 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU 9 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU 10 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU 11 - -#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f -#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP 0 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME 1 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED 2 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 -#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 -#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 -#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 -#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 - -struct 
ieee80211_mle_basic_common_info { - u8 len; - u8 mld_mac_addr[ETH_ALEN]; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_PREQ_PRES_MLD_ID 0x0010 - -struct ieee80211_mle_preq_common_info { - u8 len; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 -#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 -#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 -#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 - -/* no fixed fields in RECONF */ - -struct ieee80211_mle_tdls_common_info { - u8 len; - u8 ap_mld_mac_addr[ETH_ALEN]; -} __packed; - -#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR 0x0010 - -/* no fixed fields in PRIO_ACCESS */ - -/** - * ieee80211_mle_common_size - check multi-link element common size - * @data: multi-link element, must already be checked for size using - * ieee80211_mle_size_ok() - * Return: the size of the multi-link element's "common" subfield - */ -static inline u8 ieee80211_mle_common_size(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - - switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { - case IEEE80211_ML_CONTROL_TYPE_BASIC: - case IEEE80211_ML_CONTROL_TYPE_PREQ: - case IEEE80211_ML_CONTROL_TYPE_TDLS: - case IEEE80211_ML_CONTROL_TYPE_RECONF: - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - /* - * The length is the first octet pointed by mle->variable so no - * need to add anything - */ - break; - default: - WARN_ON(1); - return 0; - } - - return sizeof(*mle) + mle->variable[0]; -} - -/** - * ieee80211_mle_get_link_id - returns the link ID - * @data: the basic multi link element - * Return: the link ID, or -1 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline int ieee80211_mle_get_link_id(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID)) - return -1; - - return *common; -} - -/** - * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count - * @data: pointer to the basic multi link element - * Return: the BSS Parameter Change Count field value, or -1 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline int -ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)) - return -1; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - - return *common; -} - -/** - * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay - * @data: pointer to the multi-link element - * Return: the medium synchronization delay field value from the multi-link - * element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT) - * if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) - return IEEE80211_MED_SYNC_DELAY_DEFAULT; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_eml_cap - returns the EML capability - * @data: pointer to the multi-link element - * Return: the EML capability field value from the multi-link element, - * or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. - * @data: pointer to the multi-link element - * Return: the MLD capabilities and operations field value from the multi-link - * element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - - return get_unaligned_le16(common); -} - -/* Defined in Figure 9-1074t in P802.11be_D7.0 */ -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE 0x0001 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK 0x001e -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE 0x0020 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK 0x0040 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP 0x0080 - -/** - * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities - * and operations. - * @data: pointer to the multi-link element - * Return: the extended MLD capabilities and operations field value from - * the multi-link element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) - common += 1; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_mld_id - returns the MLD ID - * @data: pointer to the multi-link element - * Return: The MLD ID in the given multi-link element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u8 ieee80211_mle_get_mld_id(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - - return *common; -} - -/** - * ieee80211_mle_size_ok - validate multi-link element size - * @data: pointer to the element data - * @len: length of the containing element - * Return: whether or not the multi-link element size is OK - */ -static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u8 fixed = sizeof(*mle); - u8 common = 0; - bool check_common_len = false; - u16 control; - - if (!data || len < fixed) - return false; - - control = le16_to_cpu(mle->control); - - switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { - case IEEE80211_ML_CONTROL_TYPE_BASIC: - common += sizeof(struct ieee80211_mle_basic_common_info); - check_common_len = true; - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) - common += 2; - break; - case IEEE80211_ML_CONTROL_TYPE_PREQ: - common += sizeof(struct ieee80211_mle_preq_common_info); - if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) - common += 1; - check_common_len = true; - break; - case IEEE80211_ML_CONTROL_TYPE_RECONF: - if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) - common += ETH_ALEN; - if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) - common += 2; - break; - case IEEE80211_ML_CONTROL_TYPE_TDLS: - common += sizeof(struct ieee80211_mle_tdls_common_info); - check_common_len = true; - break; - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - common = ETH_ALEN + 1; - break; - default: - /* we don't know this type */ - return true; - } - - if (len < fixed + common) - return false; - - if (!check_common_len) - return true; - - /* if present, common length is the first octet there */ - return mle->variable[0] >= common; -} - -/** - * ieee80211_mle_type_ok - validate multi-link element type and size - * @data: pointer to the element data - * @type: expected type of the element - * @len: length of the containing element - * Return: whether or not the multi-link element type matches and size is OK - */ -static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control; - - if (!ieee80211_mle_size_ok(data, len)) - return false; - - control = 
le16_to_cpu(mle->control); - - if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type) - return true; - - return false; -} - -enum ieee80211_mle_subelems { - IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, - IEEE80211_MLE_SUBELEM_FRAGMENT = 254, -}; - -#define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f -#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE 0x0010 -#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 -#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT 0x0040 -#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT 0x0080 -#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT 0x0100 -#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT 0x0200 -#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE 0x0400 -#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT 0x0800 - -struct ieee80211_mle_per_sta_profile { - __le16 control; - u8 sta_info_len; - u8 variable[]; -} __packed; - -/** - * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta - * profile size - * @data: pointer to the sub element data - * @len: length of the containing sub element - * Return: %true if the STA profile is large enough, %false otherwise - */ -static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data, - size_t len) -{ - const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; - u16 control; - u8 fixed = sizeof(*prof); - u8 info_len = 1; - - if (len < fixed) - return false; - - control = le16_to_cpu(prof->control); - - if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) - info_len += 6; - if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) - info_len += 8; - if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && - control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { - if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) - info_len += 2; - else - info_len += 1; - } - if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) - info_len += 1; - - return prof->sta_info_len >= info_len && - fixed + prof->sta_info_len - 1 <= len; -} - -/** - * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS - * parameter change count - * @prof: the per-STA profile, having been checked with - * ieee80211_mle_basic_sta_prof_size_ok() for the correct length - * - * Return: The BSS parameter change count value if present, 0 otherwise. 
- */ -static inline u8 -ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof) -{ - u16 control = le16_to_cpu(prof->control); - const u8 *pos = prof->variable; - - if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)) - return 0; - - if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) - pos += 6; - if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) - pos += 2; - if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) - pos += 8; - if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) - pos += 2; - if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && - control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { - if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) - pos += 2; - else - pos += 1; - } - - return *pos; -} - -#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID 0x000f -#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 -#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 -#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 - -/** - * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link - * element sta profile size. - * @data: pointer to the sub element data - * @len: length of the containing sub element - * Return: %true if the STA profile is large enough, %false otherwise - */ -static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, - size_t len) -{ - const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; - u16 control; - u8 fixed = sizeof(*prof); - u8 info_len = 1; - - if (len < fixed) - return false; - - control = le16_to_cpu(prof->control); - - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) - info_len += ETH_ALEN; - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT) - info_len += 2; - - return prof->sta_info_len >= info_len && - fixed + prof->sta_info_len - 1 <= len; -} - -#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f -#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 - -static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) -{ - const struct ieee80211_ttlm_elem *t2l = (const void *)data; - u8 control, fixed = sizeof(*t2l), elem_len = 0; - - if (len < fixed) - return false; - - control = t2l->control; - - if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) - elem_len += 2; - if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) - elem_len += 3; - - if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) { - u8 bm_size; - - elem_len += 1; - if (len < fixed + elem_len) - return false; - - if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) - bm_size = 1; - else - bm_size = 2; - - elem_len += hweight8(t2l->optional[0]) * bm_size; - } - - return len >= fixed + elem_len; -} - -/** - * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay - * in microseconds - * @eml_cap: EML capabilities field value from common info field of - * 
the Multi-link element - * Return: the EMLSR Padding delay (in microseconds) encoded in the - * EML Capabilities field - */ - -static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR - * Padding Delay subfield. - */ - u32 pad_delay = u16_get_bits(eml_cap, - IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); - - if (!pad_delay || - pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) - return 0; - - return 32 * (1 << (pad_delay - 1)); -} - -/** - * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition - * delay in microseconds - * @eml_cap: EML capabilities field value from common info field of - * the Multi-link element - * Return: the EMLSR Transition delay (in microseconds) encoded in the - * EML Capabilities field - */ - -static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR - * Transition Delay subfield. - */ - u32 trans_delay = - u16_get_bits(eml_cap, - IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); - - /* invalid values also just use 0 */ - if (!trans_delay || - trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) - return 0; - - return 16 * (1 << (trans_delay - 1)); -} - -/** - * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition - * timeout value in microseconds - * @eml_cap: EML capabilities field value from common info field of - * the Multi-link element - * Return: the EMLSR Transition timeout (in microseconds) encoded in - * the EML Capabilities field - */ - -static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the - * Transition Timeout subfield. - */ - u8 timeout = u16_get_bits(eml_cap, - IEEE80211_EML_CAP_TRANSITION_TIMEOUT); - - /* invalid values also just use 0 */ - if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) - return 0; - - return 128 * (1 << (timeout - 1)); -} - -#define for_each_mle_subelement(_elem, _data, _len) \ - if (ieee80211_mle_size_ok(_data, _len)) \ - for_each_element(_elem, \ - _data + ieee80211_mle_common_size(_data),\ - _len - ieee80211_mle_common_size(_data)) - /* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ #define NAN_OP_MODE_PHY_MODE_VHT 0x01 #define NAN_OP_MODE_PHY_MODE_HE 0x10 @@ -4605,6 +3443,8 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #include "ieee80211-ht.h" #include "ieee80211-vht.h" +#include "ieee80211-he.h" +#include "ieee80211-eht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 00105d7600bfb171037783da5f26e2565c7d2106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:54 +0100 Subject: wifi: ieee80211: split S1G definitions out The ieee80211.h file has gotten very long, continue splitting it by putting S1G definitions into a separate file. 
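For illustration, a minimal usage sketch (assuming ieee80211.h keeps pulling in the new
ieee80211-s1g.h the same way it already includes the HE/EHT headers, so existing users
need no changes; the helper name below is hypothetical):

    #include <linux/ieee80211.h>

    /*
     * Hypothetical caller: returns how many optional fixed-length octets an
     * S1G beacon carries before its elements, using only helpers that now
     * live in ieee80211-s1g.h.
     */
    static size_t s1g_bcn_optional_octets(__le16 fc)
    {
            if (!ieee80211_is_s1g_beacon(fc))
                    return 0;

            /* Next TBTT (3) + Compressed SSID (4) + ANO (1), as present */
            return ieee80211_s1g_optional_len(fc);
    }
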
Link: https://patch.msgid.link/20251105153843.82c0bddee6e3.Ic6646615286dad240b42e31e9d428c7e4ea40ce0@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-s1g.h | 575 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 585 ++---------------------------------------- 2 files changed, 591 insertions(+), 569 deletions(-) create mode 100644 include/linux/ieee80211-s1g.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-s1g.h b/include/linux/ieee80211-s1g.h new file mode 100644 index 000000000000..5b9ed2dcc00e --- /dev/null +++ b/include/linux/ieee80211-s1g.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 S1G definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_S1G_H +#define LINUX_IEEE80211_S1G_H + +#include +#include + +/* bits unique to S1G beacon frame control */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 +#define IEEE80211_S1G_BCN_CSSID 0x200 +#define IEEE80211_S1G_BCN_ANO 0x400 + +/* see 802.11ah-2016 9.9 NDP CMAC frames */ +#define IEEE80211_S1G_1MHZ_NDP_BITS 25 +#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 +#define IEEE80211_S1G_2MHZ_NDP_BITS 37 +#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 + +/** + * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an S1G beacon + */ +static inline bool ieee80211_is_s1g_beacon(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | + IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); +} + +/** + * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * next TBTT field + */ +static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); +} + +/** + * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * ANO field + */ +static inline bool ieee80211_s1g_has_ano(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); +} + +/** + * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * compressed SSID field + */ +static inline bool ieee80211_s1g_has_cssid(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); +} + +/** + * enum ieee80211_s1g_chanwidth - S1G channel widths + * These are defined in IEEE802.11-2016ah Table 10-20 + * as BSS Channel Width + * + * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel + */ +enum ieee80211_s1g_chanwidth { + 
IEEE80211_S1G_CHANWIDTH_1MHZ = 0, + IEEE80211_S1G_CHANWIDTH_2MHZ = 1, + IEEE80211_S1G_CHANWIDTH_4MHZ = 3, + IEEE80211_S1G_CHANWIDTH_8MHZ = 7, + IEEE80211_S1G_CHANWIDTH_16MHZ = 15, +}; + +/** + * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths + * described in IEEE80211-2024 Table 10-39. + * + * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel + * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel + */ +enum ieee80211_s1g_pri_chanwidth { + IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, + IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, +}; + +/** + * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element + * @compat_info: Compatibility Information + * @beacon_int: Beacon Interval + * @tsf_completion: TSF Completion + * + * This structure represents the payload of the "S1G Beacon + * Compatibility element" as described in IEEE Std 802.11-2020 section + * 9.4.2.196. + */ +struct ieee80211_s1g_bcn_compat_ie { + __le16 compat_info; + __le16 beacon_int; + __le32 tsf_completion; +} __packed; + +/** + * struct ieee80211_s1g_oper_ie - S1G Operation element + * @ch_width: S1G Operation Information Channel Width + * @oper_class: S1G Operation Information Operating Class + * @primary_ch: S1G Operation Information Primary Channel Number + * @oper_ch: S1G Operation Information Channel Center Frequency + * @basic_mcs_nss: Basic S1G-MCS and NSS Set + * + * This structure represents the payload of the "S1G Operation + * element" as described in IEEE Std 802.11-2020 section 9.4.2.212. + */ +struct ieee80211_s1g_oper_ie { + u8 ch_width; + u8 oper_class; + u8 primary_ch; + u8 oper_ch; + __le16 basic_mcs_nss; +} __packed; + +/** + * struct ieee80211_aid_response_ie - AID Response element + * @aid: AID/Group AID + * @switch_count: AID Switch Count + * @response_int: AID Response Interval + * + * This structure represents the payload of the "AID Response element" + * as described in IEEE Std 802.11-2020 section 9.4.2.194. + */ +struct ieee80211_aid_response_ie { + __le16 aid; + u8 switch_count; + __le16 response_int; +} __packed; + +struct ieee80211_s1g_cap { + u8 capab_info[10]; + u8 supp_mcs_nss[5]; +} __packed; + +/** + * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields + * @fc: frame control bytes in little-endian byteorder + * Return: total length in bytes of the optional fixed-length fields + * + * S1G beacons may contain up to three optional fixed-length fields that + * precede the variable-length elements. Whether these fields are present + * is indicated by flags in the frame control field. 
+ * + * From IEEE 802.11-2024 section 9.3.4.3: + * - Next TBTT field may be 0 or 3 bytes + * - Short SSID field may be 0 or 4 bytes + * - Access Network Options (ANO) field may be 0 or 1 byte + */ +static inline size_t +ieee80211_s1g_optional_len(__le16 fc) +{ + size_t len = 0; + + if (ieee80211_s1g_has_next_tbtt(fc)) + len += 3; + + if (ieee80211_s1g_has_cssid(fc)) + len += 4; + + if (ieee80211_s1g_has_ano(fc)) + len += 1; + + return len; +} + +/* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + +#define S1G_CAP0_S1G_LONG BIT(0) +#define S1G_CAP0_SGI_1MHZ BIT(1) +#define S1G_CAP0_SGI_2MHZ BIT(2) +#define S1G_CAP0_SGI_4MHZ BIT(3) +#define S1G_CAP0_SGI_8MHZ BIT(4) +#define S1G_CAP0_SGI_16MHZ BIT(5) +#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) + +#define S1G_SUPP_CH_WIDTH_2 0 +#define S1G_SUPP_CH_WIDTH_4 1 +#define S1G_SUPP_CH_WIDTH_8 2 +#define S1G_SUPP_CH_WIDTH_16 3 +#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ + cap[0])) << 1) + +#define S1G_CAP1_RX_LDPC BIT(0) +#define S1G_CAP1_TX_STBC BIT(1) +#define S1G_CAP1_RX_STBC BIT(2) +#define S1G_CAP1_SU_BFER BIT(3) +#define S1G_CAP1_SU_BFEE BIT(4) +#define S1G_CAP1_BFEE_STS GENMASK(7, 5) + +#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) +#define S1G_CAP2_MU_BFER BIT(3) +#define S1G_CAP2_MU_BFEE BIT(4) +#define S1G_CAP2_PLUS_HTC_VHT BIT(5) +#define S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) + +#define S1G_CAP3_RD_RESPONDER BIT(0) +#define S1G_CAP3_HT_DELAYED_BA BIT(1) +#define S1G_CAP3_MAX_MPDU_LEN BIT(2) +#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) +#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) + +#define S1G_CAP4_UPLINK_SYNC BIT(0) +#define S1G_CAP4_DYNAMIC_AID BIT(1) +#define S1G_CAP4_BAT BIT(2) +#define S1G_CAP4_TIME_ADE BIT(3) +#define S1G_CAP4_NON_TIM BIT(4) +#define S1G_CAP4_GROUP_AID BIT(5) +#define S1G_CAP4_STA_TYPE GENMASK(7, 6) + +#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAP5_AMSDU BIT(2) +#define S1G_CAP5_AMPDU BIT(3) +#define S1G_CAP5_ASYMMETRIC_BA BIT(4) +#define S1G_CAP5_FLOW_CONTROL BIT(5) +#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) + +#define S1G_CAP6_OBSS_MITIGATION BIT(0) +#define S1G_CAP6_FRAGMENT_BA BIT(1) +#define S1G_CAP6_NDP_PS_POLL BIT(2) +#define S1G_CAP6_RAW_OPERATION BIT(3) +#define S1G_CAP6_PAGE_SLICING BIT(4) +#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) + +#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAP7_DUP_1MHZ BIT(1) +#define S1G_CAP7_MCS_NEGOTIATION BIT(2) +#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAP8_TWT_GROUPING BIT(0) +#define S1G_CAP8_BDT BIT(1) +#define S1G_CAP8_COLOR GENMASK(4, 2) +#define S1G_CAP8_TWT_REQUEST BIT(5) +#define S1G_CAP8_TWT_RESPOND BIT(6) +#define S1G_CAP8_PV1_FRAME BIT(7) + +#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + +#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) +#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) + +#define S1G_2M_PRIMARY_LOCATION_LOWER 0 +#define S1G_2M_PRIMARY_LOCATION_UPPER 1 + +#define LISTEN_INT_USF GENMASK(15, 14) +#define LISTEN_INT_UI GENMASK(13, 0) + +#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) +#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) + +/* S1G encoding types */ +#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 
0 +#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 +#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 + +enum ieee80211_s1g_actioncode { + WLAN_S1G_AID_SWITCH_REQUEST, + WLAN_S1G_AID_SWITCH_RESPONSE, + WLAN_S1G_SYNC_CONTROL, + WLAN_S1G_STA_INFO_ANNOUNCE, + WLAN_S1G_EDCA_PARAM_SET, + WLAN_S1G_EL_OPERATION, + WLAN_S1G_TWT_SETUP, + WLAN_S1G_TWT_TEARDOWN, + WLAN_S1G_SECT_GROUP_ID_LIST, + WLAN_S1G_SECT_ID_FEEDBACK, + WLAN_S1G_TWT_INFORMATION = 11, +}; + +/** + * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon + * @fc: frame control bytes in little-endian byteorder + * @variable: pointer to the beacon frame elements + * @variable_len: length of the frame elements + * Return: whether or not the frame is an S1G short beacon. As per + * IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall + * always be present as the first element in beacon frames generated at a + * TBTT (Target Beacon Transmission Time), so any frame not containing + * this element must have been generated at a TSBTT (Target Short Beacon + * Transmission Time) that is not a TBTT. Additionally, short beacons are + * prohibited from containing the S1G beacon compatibility element as per + * IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with + * either no elements or the first element is not the beacon compatibility + * element, we have a short beacon. + */ +static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable, + size_t variable_len) +{ + if (!ieee80211_is_s1g_beacon(fc)) + return false; + + /* + * If the frame does not contain at least 1 element (this is perfectly + * valid in a short beacon) and is an S1G beacon, we have a short + * beacon. + */ + if (variable_len < 2) + return true; + + return variable[0] != WLAN_EID_S1G_BCN_COMPAT; +} + +struct s1g_tim_aid { + u16 aid; + u8 target_blk; /* Target block index */ + u8 target_subblk; /* Target subblock index */ + u8 target_subblk_bit; /* Target subblock bit */ +}; + +struct s1g_tim_enc_block { + u8 enc_mode; + bool inverse; + const u8 *ptr; + u8 len; + + /* + * For an OLB encoded block that spans multiple blocks, this + * is the offset into the span described by that encoded block. + */ + u8 olb_blk_offset; +}; + +/* + * Helper routines to quickly extract the length of an encoded block. Validation + * is also performed to ensure the length extracted lies within the TIM. + */ + +static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) +{ + u8 blkmap; + u8 n_subblks; + + if (ptr >= end) + return -EINVAL; + + blkmap = *ptr; + n_subblks = hweight8(blkmap); + + if (ptr + 1 + n_subblks > end) + return -EINVAL; + + return 1 + n_subblks; +} + +static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) +{ + return (ptr + 1 > end) ? -EINVAL : 1; +} + +static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) +{ + if (ptr >= end) + return -EINVAL; + + return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; +} + +/* + * Enumerate all encoded blocks until we find the encoded block that describes + * our target AID. OLB is a special case as a single encoded block can describe + * multiple blocks as a single encoded block. 
+ */
+static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc,
+						   const struct s1g_tim_aid *aid,
+						   const u8 *ptr, const u8 *end)
+{
+	/* need at least block-control octet */
+	while (ptr + 1 <= end) {
+		u8 ctrl = *ptr++;
+		u8 mode = ctrl & 0x03;
+		bool contains, inverse = ctrl & BIT(2);
+		u8 span, blk_off = ctrl >> 3;
+		int len;
+
+		switch (mode) {
+		case IEEE80211_S1G_TIM_ENC_MODE_BLOCK:
+			len = ieee80211_s1g_len_bitmap(ptr, end);
+			contains = blk_off == aid->target_blk;
+			break;
+		case IEEE80211_S1G_TIM_ENC_MODE_SINGLE:
+			len = ieee80211_s1g_len_single(ptr, end);
+			contains = blk_off == aid->target_blk;
+			break;
+		case IEEE80211_S1G_TIM_ENC_MODE_OLB:
+			len = ieee80211_s1g_len_olb(ptr, end);
+			/*
+			 * An OLB encoded block can describe more than one
+			 * block, meaning an encoded OLB block can span more
+			 * than a single block.
+			 */
+			if (len > 0) {
+				/* Minus one for the length octet */
+				span = DIV_ROUND_UP(len - 1, 8);
+				/*
+				 * Check if our target block lies within the
+				 * block span described by this encoded block.
+				 */
+				contains = (aid->target_blk >= blk_off) &&
+					   (aid->target_blk < blk_off + span);
+			}
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+
+		if (len < 0)
+			return len;
+
+		if (contains) {
+			enc->enc_mode = mode;
+			enc->inverse = inverse;
+			enc->ptr = ptr;
+			enc->len = (u8)len;
+			enc->olb_blk_offset = blk_off;
+			return 0;
+		}
+
+		ptr += len;
+	}
+
+	return -ENOENT;
+}
+
+static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc,
+					      struct s1g_tim_aid *aid)
+{
+	const u8 *ptr = enc->ptr;
+	u8 blkmap = *ptr++;
+
+	/*
+	 * If our block bitmap does not contain a set bit that corresponds
+	 * to our AID, it could mean a variety of things depending on whether
+	 * the encoding mode is inverted or not.
+	 *
+	 * 1. If inverted, it means the entire subblock is present and hence
+	 *    our AID has been set.
+	 * 2. If not inverted, it means our subblock is not present and hence
+	 *    it is all zero meaning our AID is not set.
+	 */
+	if (!(blkmap & BIT(aid->target_subblk)))
+		return enc->inverse;
+
+	/*
+	 * Increment ptr by the number of set subblocks that appear before our
+	 * target subblock. If our target subblock is 0, do nothing as ptr
+	 * already points to our target subblock.
+	 */
+	if (aid->target_subblk)
+		ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0));
+
+	return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse;
+}
+
+static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc,
+					      struct s1g_tim_aid *aid)
+{
+	/*
+	 * Single AID mode describes, as the name suggests, a single AID
+	 * within the block described by the encoded block. The octet
+	 * contains the 6 LSBs of the AID described in the block. The other
+	 * 2 bits are reserved. When inverted, every single AID described
+	 * by the current block has buffered traffic except for the AID
+	 * described in the single AID octet.
+	 */
+	return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse;
+}
+
+static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc,
+					   struct s1g_tim_aid *aid)
+{
+	const u8 *ptr = enc->ptr;
+	u8 blk_len = *ptr++;
+	/*
+	 * Given an OLB encoded block that describes multiple blocks,
+	 * calculate the offset into the span. Then calculate the
+	 * subblock location normally.
+ */ + u16 span_offset = aid->target_blk - enc->olb_blk_offset; + u16 subblk_idx = span_offset * 8 + aid->target_subblk; + + if (subblk_idx >= blk_len) + return enc->inverse; + + return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +/* + * An S1G PVB has 3 non-optional encoding types, each of which can be inverted. + * An S1G PVB is constructed with zero or more encoded block subfields. Each + * encoded block represents a single "block" of AIDs (64), and each encoded + * block can contain one of the 3 encoding types alongside a single bit for + * whether the bits should be inverted. + * + * As the standard makes no guarantee about the ordering of encoded blocks, + * we must parse every encoded block in the worst case, e.g. given an + * AID that lies within the last block. + */ +static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + int err; + struct s1g_tim_aid target_aid; + struct s1g_tim_enc_block enc_blk; + + if (tim_len < 3) + return false; + + target_aid.aid = aid; + target_aid.target_blk = (aid >> 6) & 0x1f; + target_aid.target_subblk = (aid >> 3) & 0x7; + target_aid.target_subblk_bit = aid & 0x7; + + /* + * Find our AID's target encoded block and fill &enc_blk with the + * encoded block's information. If no entry is found or an error + * occurs, return false. + */ + err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, + tim->virtual_map, + (const u8 *)tim + tim_len + 2); + if (err) + return false; + + switch (enc_blk.enc_mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + return ieee80211_s1g_parse_single(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); + default: + return false; + } +} + +#endif /* LINUX_IEEE80211_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 63a9775b059d..1b27bbac145b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -109,17 +109,6 @@ #define IEEE80211_STYPE_DMG_BEACON 0x0000 #define IEEE80211_STYPE_S1G_BEACON 0x0010 -/* bits unique to S1G beacon */ -#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 -#define IEEE80211_S1G_BCN_CSSID 0x200 -#define IEEE80211_S1G_BCN_ANO 0x400 - -/* see 802.11ah-2016 9.9 NDP CMAC frames */ -#define IEEE80211_S1G_1MHZ_NDP_BITS 25 -#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 -#define IEEE80211_S1G_2MHZ_NDP_BITS 37 -#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 - #define IEEE80211_NDP_FTYPE_CTS 0 #define IEEE80211_NDP_FTYPE_CF_END 0 #define IEEE80211_NDP_FTYPE_PS_POLL 1 @@ -221,11 +210,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 -/* S1G encoding types */ -#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 -#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 -#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 - /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section 6.2.1.1.2. 
@@ -604,55 +588,6 @@ static inline bool ieee80211_is_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } -/** - * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && - * IEEE80211_STYPE_S1G_BEACON - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame is an S1G beacon - */ -static inline bool ieee80211_is_s1g_beacon(__le16 fc) -{ - return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | - IEEE80211_FCTL_STYPE)) == - cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); -} - -/** - * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * next TBTT field - */ -static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); -} - -/** - * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * ANO field - */ -static inline bool ieee80211_s1g_has_ano(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); -} - -/** - * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * compressed SSID field - */ -static inline bool ieee80211_s1g_has_cssid(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); -} - /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -984,37 +919,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * enum ieee80211_s1g_chanwidth - S1G channel widths - * These are defined in IEEE802.11-2016ah Table 10-20 - * as BSS Channel Width - * - * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel - */ -enum ieee80211_s1g_chanwidth { - IEEE80211_S1G_CHANWIDTH_1MHZ = 0, - IEEE80211_S1G_CHANWIDTH_2MHZ = 1, - IEEE80211_S1G_CHANWIDTH_4MHZ = 3, - IEEE80211_S1G_CHANWIDTH_8MHZ = 7, - IEEE80211_S1G_CHANWIDTH_16MHZ = 15, -}; - -/** - * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths - * described in IEEE80211-2024 Table 10-39. - * - * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel - * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel - */ -enum ieee80211_s1g_pri_chanwidth { - IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, - IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, -}; - #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -1042,61 +946,6 @@ struct ieee80211_addba_ext_ie { u8 data; } __packed; -/** - * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element - * @compat_info: Compatibility Information - * @beacon_int: Beacon Interval - * @tsf_completion: TSF Completion - * - * This structure represents the payload of the "S1G Beacon - * Compatibility element" as described in IEEE Std 802.11-2020 section - * 9.4.2.196. 
- */ -struct ieee80211_s1g_bcn_compat_ie { - __le16 compat_info; - __le16 beacon_int; - __le32 tsf_completion; -} __packed; - -/** - * struct ieee80211_s1g_oper_ie - S1G Operation element - * @ch_width: S1G Operation Information Channel Width - * @oper_class: S1G Operation Information Operating Class - * @primary_ch: S1G Operation Information Primary Channel Number - * @oper_ch: S1G Operation Information Channel Center Frequency - * @basic_mcs_nss: Basic S1G-MCS and NSS Set - * - * This structure represents the payload of the "S1G Operation - * element" as described in IEEE Std 802.11-2020 section 9.4.2.212. - */ -struct ieee80211_s1g_oper_ie { - u8 ch_width; - u8 oper_class; - u8 primary_ch; - u8 oper_ch; - __le16 basic_mcs_nss; -} __packed; - -/** - * struct ieee80211_aid_response_ie - AID Response element - * @aid: AID/Group AID - * @switch_count: AID Switch Count - * @response_int: AID Response Interval - * - * This structure represents the payload of the "AID Response element" - * as described in IEEE Std 802.11-2020 section 9.4.2.194. - */ -struct ieee80211_aid_response_ie { - __le16 aid; - u8 switch_count; - __le16 response_int; -} __packed; - -struct ieee80211_s1g_cap { - u8 capab_info[10]; - u8 supp_mcs_nss[5]; -} __packed; - struct ieee80211_ext { __le16 frame_control; __le16 duration; @@ -1110,37 +959,6 @@ struct ieee80211_ext { } u; } __packed __aligned(2); -/** - * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields - * @fc: frame control bytes in little-endian byteorder - * Return: total length in bytes of the optional fixed-length fields - * - * S1G beacons may contain up to three optional fixed-length fields that - * precede the variable-length elements. Whether these fields are present - * is indicated by flags in the frame control field. 
- * - * From IEEE 802.11-2024 section 9.3.4.3: - * - Next TBTT field may be 0 or 3 bytes - * - Short SSID field may be 0 or 4 bytes - * - Access Network Options (ANO) field may be 0 or 1 byte - */ -static inline size_t -ieee80211_s1g_optional_len(__le16 fc) -{ - size_t len = 0; - - if (ieee80211_s1g_has_next_tbtt(fc)) - len += 3; - - if (ieee80211_s1g_has_cssid(fc)) - len += 4; - - if (ieee80211_s1g_has_ano(fc)) - len += 1; - - return len; -} - /** * struct ieee80211_bss_load_elem - BSS Load elemen * @@ -1567,98 +1385,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/* S1G Capabilities Information field */ -#define IEEE80211_S1G_CAPABILITY_LEN 15 - -#define S1G_CAP0_S1G_LONG BIT(0) -#define S1G_CAP0_SGI_1MHZ BIT(1) -#define S1G_CAP0_SGI_2MHZ BIT(2) -#define S1G_CAP0_SGI_4MHZ BIT(3) -#define S1G_CAP0_SGI_8MHZ BIT(4) -#define S1G_CAP0_SGI_16MHZ BIT(5) -#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) - -#define S1G_SUPP_CH_WIDTH_2 0 -#define S1G_SUPP_CH_WIDTH_4 1 -#define S1G_SUPP_CH_WIDTH_8 2 -#define S1G_SUPP_CH_WIDTH_16 3 -#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ - cap[0])) << 1) - -#define S1G_CAP1_RX_LDPC BIT(0) -#define S1G_CAP1_TX_STBC BIT(1) -#define S1G_CAP1_RX_STBC BIT(2) -#define S1G_CAP1_SU_BFER BIT(3) -#define S1G_CAP1_SU_BFEE BIT(4) -#define S1G_CAP1_BFEE_STS GENMASK(7, 5) - -#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) -#define S1G_CAP2_MU_BFER BIT(3) -#define S1G_CAP2_MU_BFEE BIT(4) -#define S1G_CAP2_PLUS_HTC_VHT BIT(5) -#define S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) - -#define S1G_CAP3_RD_RESPONDER BIT(0) -#define S1G_CAP3_HT_DELAYED_BA BIT(1) -#define S1G_CAP3_MAX_MPDU_LEN BIT(2) -#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) -#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) - -#define S1G_CAP4_UPLINK_SYNC BIT(0) -#define S1G_CAP4_DYNAMIC_AID BIT(1) -#define S1G_CAP4_BAT BIT(2) -#define S1G_CAP4_TIME_ADE BIT(3) -#define S1G_CAP4_NON_TIM BIT(4) -#define S1G_CAP4_GROUP_AID BIT(5) -#define S1G_CAP4_STA_TYPE GENMASK(7, 6) - -#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) -#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) -#define S1G_CAP5_AMSDU BIT(2) -#define S1G_CAP5_AMPDU BIT(3) -#define S1G_CAP5_ASYMMETRIC_BA BIT(4) -#define S1G_CAP5_FLOW_CONTROL BIT(5) -#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) - -#define S1G_CAP6_OBSS_MITIGATION BIT(0) -#define S1G_CAP6_FRAGMENT_BA BIT(1) -#define S1G_CAP6_NDP_PS_POLL BIT(2) -#define S1G_CAP6_RAW_OPERATION BIT(3) -#define S1G_CAP6_PAGE_SLICING BIT(4) -#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) -#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) - -#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) -#define S1G_CAP7_DUP_1MHZ BIT(1) -#define S1G_CAP7_MCS_NEGOTIATION BIT(2) -#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) -#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) -#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) -#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) -#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) - -#define S1G_CAP8_TWT_GROUPING BIT(0) -#define S1G_CAP8_BDT BIT(1) -#define S1G_CAP8_COLOR GENMASK(4, 2) -#define S1G_CAP8_TWT_REQUEST BIT(5) -#define S1G_CAP8_TWT_RESPOND BIT(6) -#define S1G_CAP8_PV1_FRAME BIT(7) - -#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) - -#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) -#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) -#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) - -#define S1G_2M_PRIMARY_LOCATION_LOWER 0 -#define S1G_2M_PRIMARY_LOCATION_UPPER 1 - -#define LISTEN_INT_USF GENMASK(15, 14) -#define 
LISTEN_INT_UI GENMASK(13, 0) - -#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) -#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) - /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2189,20 +1915,6 @@ enum ieee80211_key_len { WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; -enum ieee80211_s1g_actioncode { - WLAN_S1G_AID_SWITCH_REQUEST, - WLAN_S1G_AID_SWITCH_RESPONSE, - WLAN_S1G_SYNC_CONTROL, - WLAN_S1G_STA_INFO_ANNOUNCE, - WLAN_S1G_EDCA_PARAM_SET, - WLAN_S1G_EL_OPERATION, - WLAN_S1G_TWT_SETUP, - WLAN_S1G_TWT_TEARDOWN, - WLAN_S1G_SECT_GROUP_ID_LIST, - WLAN_S1G_SECT_ID_FEEDBACK, - WLAN_S1G_TWT_INFORMATION = 11, -}; - /* Radio measurement action codes as defined in IEEE 802.11-2024 - Table 9-470 */ enum ieee80211_radio_measurement_actioncode { WLAN_RM_ACTION_RADIO_MEASUREMENT_REQUEST = 0, @@ -2877,254 +2589,6 @@ static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } -struct s1g_tim_aid { - u16 aid; - u8 target_blk; /* Target block index */ - u8 target_subblk; /* Target subblock index */ - u8 target_subblk_bit; /* Target subblock bit */ -}; - -struct s1g_tim_enc_block { - u8 enc_mode; - bool inverse; - const u8 *ptr; - u8 len; - - /* - * For an OLB encoded block that spans multiple blocks, this - * is the offset into the span described by that encoded block. - */ - u8 olb_blk_offset; -}; - -/* - * Helper routines to quickly extract the length of an encoded block. Validation - * is also performed to ensure the length extracted lies within the TIM. - */ - -static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) -{ - u8 blkmap; - u8 n_subblks; - - if (ptr >= end) - return -EINVAL; - - blkmap = *ptr; - n_subblks = hweight8(blkmap); - - if (ptr + 1 + n_subblks > end) - return -EINVAL; - - return 1 + n_subblks; -} - -static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) -{ - return (ptr + 1 > end) ? -EINVAL : 1; -} - -static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) -{ - if (ptr >= end) - return -EINVAL; - - return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; -} - -/* - * Enumerate all encoded blocks until we find the encoded block that describes - * our target AID. OLB is a special case as a single encoded block can describe - * multiple blocks as a single encoded block. - */ -static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, - const struct s1g_tim_aid *aid, - const u8 *ptr, const u8 *end) -{ - /* need at least block-control octet */ - while (ptr + 1 <= end) { - u8 ctrl = *ptr++; - u8 mode = ctrl & 0x03; - bool contains, inverse = ctrl & BIT(2); - u8 span, blk_off = ctrl >> 3; - int len; - - switch (mode) { - case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: - len = ieee80211_s1g_len_bitmap(ptr, end); - contains = blk_off == aid->target_blk; - break; - case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: - len = ieee80211_s1g_len_single(ptr, end); - contains = blk_off == aid->target_blk; - break; - case IEEE80211_S1G_TIM_ENC_MODE_OLB: - len = ieee80211_s1g_len_olb(ptr, end); - /* - * An OLB encoded block can describe more then one - * block, meaning an encoded OLB block can span more - * then a single block. - */ - if (len > 0) { - /* Minus one for the length octet */ - span = DIV_ROUND_UP(len - 1, 8); - /* - * Check if our target block lies within the - * block span described by this encoded block. 
- */ - contains = (aid->target_blk >= blk_off) && - (aid->target_blk < blk_off + span); - } - break; - default: - return -EOPNOTSUPP; - } - - if (len < 0) - return len; - - if (contains) { - enc->enc_mode = mode; - enc->inverse = inverse; - enc->ptr = ptr; - enc->len = (u8)len; - enc->olb_blk_offset = blk_off; - return 0; - } - - ptr += len; - } - - return -ENOENT; -} - -static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - const u8 *ptr = enc->ptr; - u8 blkmap = *ptr++; - - /* - * If our block bitmap does not contain a set bit that corresponds - * to our AID, it could mean a variety of things depending on if - * the encoding mode is inverted or not. - * - * 1. If inverted, it means the entire subblock is present and hence - * our AID has been set. - * 2. If not inverted, it means our subblock is not present and hence - * it is all zero meaning our AID is not set. - */ - if (!(blkmap & BIT(aid->target_subblk))) - return enc->inverse; - - /* - * Increment ptr by the number of set subblocks that appear before our - * target subblock. If our target subblock is 0, do nothing as ptr - * already points to our target subblock. - */ - if (aid->target_subblk) - ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); - - return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; -} - -static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - /* - * Single AID mode describes, as the name suggests, a single AID - * within the block described by the encoded block. The octet - * contains the 6 LSBs of the AID described in the block. The other - * 2 bits are reserved. When inversed, every single AID described - * by the current block have buffered traffic except for the AID - * described in the single AID octet. - */ - return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; -} - -static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - const u8 *ptr = enc->ptr; - u8 blk_len = *ptr++; - /* - * Given an OLB encoded block that describes multiple blocks, - * calculate the offset into the span. Then calculate the - * subblock location normally. - */ - u16 span_offset = aid->target_blk - enc->olb_blk_offset; - u16 subblk_idx = span_offset * 8 + aid->target_subblk; - - if (subblk_idx >= blk_len) - return enc->inverse; - - return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; -} - -/* - * An S1G PVB has 3 non optional encoding types, each that can be inverted. - * An S1G PVB is constructed with zero or more encoded block subfields. Each - * encoded block represents a single "block" of AIDs (64), and each encoded - * block can contain one of the 3 encoding types alongside a single bit for - * whether the bits should be inverted. - * - * As the standard makes no guarantee about the ordering of encoded blocks, - * we must parse every encoded block in the worst case scenario given an - * AID that lies within the last block. - */ -static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid) -{ - int err; - struct s1g_tim_aid target_aid; - struct s1g_tim_enc_block enc_blk; - - if (tim_len < 3) - return false; - - target_aid.aid = aid; - target_aid.target_blk = (aid >> 6) & 0x1f; - target_aid.target_subblk = (aid >> 3) & 0x7; - target_aid.target_subblk_bit = aid & 0x7; - - /* - * Find our AIDs target encoded block and fill &enc_blk with the - * encoded blocks information. 
If no entry is found or an error - * occurs return false. - */ - err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, - tim->virtual_map, - (const u8 *)tim + tim_len + 2); - if (err) - return false; - - switch (enc_blk.enc_mode) { - case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: - return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); - case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: - return ieee80211_s1g_parse_single(&enc_blk, &target_aid); - case IEEE80211_S1G_TIM_ENC_MODE_OLB: - return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); - default: - return false; - } -} - -/** - * ieee80211_check_tim - check if AID bit is set in TIM - * @tim: the TIM IE - * @tim_len: length of the TIM IE - * @aid: the AID to look for - * @s1g: whether the TIM is from an S1G PPDU - * Return: whether or not traffic is indicated in the TIM for the given AID - */ -static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid, bool s1g) -{ - return s1g ? ieee80211_s1g_check_tim(tim, tim_len, aid) : - __ieee80211_check_tim(tim, tim_len, aid); -} - /** * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked @@ -3258,39 +2722,6 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) return false; } -/** - * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon - * @fc: frame control bytes in little-endian byteorder - * @variable: pointer to the beacon frame elements - * @variable_len: length of the frame elements - * Return: whether or not the frame is an S1G short beacon. As per - * IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall - * always be present as the first element in beacon frames generated at a - * TBTT (Target Beacon Transmission Time), so any frame not containing - * this element must have been generated at a TSBTT (Target Short Beacon - * Transmission Time) that is not a TBTT. Additionally, short beacons are - * prohibited from containing the S1G beacon compatibility element as per - * IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with - * either no elements or the first element is not the beacon compatibility - * element, we have a short beacon. - */ -static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable, - size_t variable_len) -{ - if (!ieee80211_is_s1g_beacon(fc)) - return false; - - /* - * If the frame does not contain at least 1 element (this is perfectly - * valid in a short beacon) and is an S1G beacon, we have a short - * beacon. - */ - if (variable_len < 2) - return true; - - return variable[0] != WLAN_EID_S1G_BCN_COMPAT; -} - struct element { u8 id; u8 datalen; @@ -3446,5 +2877,21 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-he.h" #include "ieee80211-eht.h" #include "ieee80211-mesh.h" +#include "ieee80211-s1g.h" + +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + * @s1g: whether the TIM is from an S1G PPDU + * Return: whether or not traffic is indicated in the TIM for the given AID + */ +static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid, bool s1g) +{ + return s1g ? 
ieee80211_s1g_check_tim(tim, tim_len, aid) : + __ieee80211_check_tim(tim, tim_len, aid); +} #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From fcd42b909ba06737dfcda47f3a0a9718bd3ebf03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:55 +0100 Subject: wifi: ieee80211: split P2P definitions out The ieee80211.h file has gotten very long, continue splitting it by putting P2P definitions into a separate file. Note that P2P isn't really even IEEE 802.11 but WFA. Link: https://patch.msgid.link/20251105153843.e47b2614e9d2.Id242f61da720e365f6b5d7a4a545fbbc2f1e92b4@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-p2p.h | 71 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 53 +------------------------------- 2 files changed, 72 insertions(+), 52 deletions(-) create mode 100644 include/linux/ieee80211-p2p.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-p2p.h b/include/linux/ieee80211-p2p.h new file mode 100644 index 000000000000..180891c11f08 --- /dev/null +++ b/include/linux/ieee80211-p2p.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * WFA P2P definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_P2P_H +#define LINUX_IEEE80211_P2P_H + +#include +/* + * Peer-to-Peer IE attribute related definitions. + */ +/* + * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute. + */ +enum ieee80211_p2p_attr_id { + IEEE80211_P2P_ATTR_STATUS = 0, + IEEE80211_P2P_ATTR_MINOR_REASON, + IEEE80211_P2P_ATTR_CAPABILITY, + IEEE80211_P2P_ATTR_DEVICE_ID, + IEEE80211_P2P_ATTR_GO_INTENT, + IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT, + IEEE80211_P2P_ATTR_LISTEN_CHANNEL, + IEEE80211_P2P_ATTR_GROUP_BSSID, + IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING, + IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR, + IEEE80211_P2P_ATTR_MANAGABILITY, + IEEE80211_P2P_ATTR_CHANNEL_LIST, + IEEE80211_P2P_ATTR_ABSENCE_NOTICE, + IEEE80211_P2P_ATTR_DEVICE_INFO, + IEEE80211_P2P_ATTR_GROUP_INFO, + IEEE80211_P2P_ATTR_GROUP_ID, + IEEE80211_P2P_ATTR_INTERFACE, + IEEE80211_P2P_ATTR_OPER_CHANNEL, + IEEE80211_P2P_ATTR_INVITE_FLAGS, + /* 19 - 220: Reserved */ + IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221, + + IEEE80211_P2P_ATTR_MAX +}; + +/* Notice of Absence attribute - described in P2P spec 4.1.14 */ +/* Typical max value used here */ +#define IEEE80211_P2P_NOA_DESC_MAX 4 + +struct ieee80211_p2p_noa_desc { + u8 count; + __le32 duration; + __le32 interval; + __le32 start_time; +} __packed; + +struct ieee80211_p2p_noa_attr { + u8 index; + u8 oppps_ctwindow; + struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX]; +} __packed; + +#define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) +#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F + +#endif /* LINUX_IEEE80211_P2P_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1b27bbac145b..fa0f7f917ce7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1333,58 +1333,6 @@ struct ieee80211_tdls_data { } u; } __packed; -/* - * Peer-to-Peer IE attribute related definitions. - */ -/* - * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute. 
- */ -enum ieee80211_p2p_attr_id { - IEEE80211_P2P_ATTR_STATUS = 0, - IEEE80211_P2P_ATTR_MINOR_REASON, - IEEE80211_P2P_ATTR_CAPABILITY, - IEEE80211_P2P_ATTR_DEVICE_ID, - IEEE80211_P2P_ATTR_GO_INTENT, - IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT, - IEEE80211_P2P_ATTR_LISTEN_CHANNEL, - IEEE80211_P2P_ATTR_GROUP_BSSID, - IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING, - IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR, - IEEE80211_P2P_ATTR_MANAGABILITY, - IEEE80211_P2P_ATTR_CHANNEL_LIST, - IEEE80211_P2P_ATTR_ABSENCE_NOTICE, - IEEE80211_P2P_ATTR_DEVICE_INFO, - IEEE80211_P2P_ATTR_GROUP_INFO, - IEEE80211_P2P_ATTR_GROUP_ID, - IEEE80211_P2P_ATTR_INTERFACE, - IEEE80211_P2P_ATTR_OPER_CHANNEL, - IEEE80211_P2P_ATTR_INVITE_FLAGS, - /* 19 - 220: Reserved */ - IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221, - - IEEE80211_P2P_ATTR_MAX -}; - -/* Notice of Absence attribute - described in P2P spec 4.1.14 */ -/* Typical max value used here */ -#define IEEE80211_P2P_NOA_DESC_MAX 4 - -struct ieee80211_p2p_noa_desc { - u8 count; - __le32 duration; - __le32 interval; - __le32 start_time; -} __packed; - -struct ieee80211_p2p_noa_attr { - u8 index; - u8 oppps_ctwindow; - struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX]; -} __packed; - -#define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) -#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F - /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2878,6 +2826,7 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-eht.h" #include "ieee80211-mesh.h" #include "ieee80211-s1g.h" +#include "ieee80211-p2p.h" /** * ieee80211_check_tim - check if AID bit is set in TIM -- cgit v1.2.3 From 60a3734192fa6909c48e33b0d212990ebaff54c4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:56 +0100 Subject: wifi: ieee80211: split NAN definitions out The ieee80211.h file has gotten very long, continue splitting it by putting NAN definitions into a separate file. Note that NAN isn't really even IEEE 802.11 but WFA. Link: https://patch.msgid.link/20251105153843.8da0e796dda2.I7b2ce11220b70e8794019501eabbf8afbaf431a6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-nan.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 18 +----------------- 2 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 include/linux/ieee80211-nan.h (limited to 'include/linux') diff --git a/include/linux/ieee80211-nan.h b/include/linux/ieee80211-nan.h new file mode 100644 index 000000000000..d07959bf8a90 --- /dev/null +++ b/include/linux/ieee80211-nan.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * WFA NAN definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_NAN_H +#define LINUX_IEEE80211_NAN_H + +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + +#endif /* LINUX_IEEE80211_NAN_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fa0f7f917ce7..48ce05e1d203 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2803,23 +2803,6 @@ struct ieee80211_tbtt_info_ge_11 { struct ieee80211_rnr_mld_params mld_params; } __packed; -/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ -#define NAN_OP_MODE_PHY_MODE_VHT 0x01 -#define NAN_OP_MODE_PHY_MODE_HE 0x10 -#define NAN_OP_MODE_PHY_MODE_MASK 0x11 -#define NAN_OP_MODE_80P80MHZ 0x02 -#define NAN_OP_MODE_160MHZ 0x04 -#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 - -/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification - * Table 79 - */ -#define NAN_DEV_CAPA_DFS_OWNER 0x01 -#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 -#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 -#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 -#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 - #include "ieee80211-ht.h" #include "ieee80211-vht.h" #include "ieee80211-he.h" @@ -2827,6 +2810,7 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-mesh.h" #include "ieee80211-s1g.h" #include "ieee80211-p2p.h" +#include "ieee80211-nan.h" /** * ieee80211_check_tim - check if AID bit is set in TIM -- cgit v1.2.3 From 68eb1b791ac8da7c3d03967143f1417e2978bf5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 16:08:10 +0100 Subject: wifi: mac80211: pass frame type to element parsing This will be needed for UHR operation parsing, and we already pass whether or not the frame is an action frame, replace that by the full type. Note this fixes a few cases where 'false' was erroneously passed (mesh and TDLS) and removes ieee802_11_parse_elems_crc() as it's unused. 
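As a quick illustration (not part of the patch itself, and mirroring the ibss.c and scan.c hunks below), a call site that used to pass a bare bool now derives the combined ftype/stype value from the frame control field and hands that to the parser; mgmt, len and baselen are assumed to be the usual locals of an rx handler:

	/* sketch only: mgmt, len and baselen as in a typical rx handler */
	u16 type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE;
	struct ieee802_11_elems *elems;

	elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable,
				       len - baselen, type, NULL);
	if (elems && !elems->parse_error) {
		/* use the parsed elements */
	}
	kfree(elems);
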
Link: https://patch.msgid.link/20251105160810.a476d20a6e01.Ie659535f9357f2f9a3c73f8c059ccfc96bf93b54@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + net/mac80211/agg-rx.c | 7 +++++-- net/mac80211/ibss.c | 14 +++++++++----- net/mac80211/ieee80211_i.h | 21 ++++++--------------- net/mac80211/mesh.c | 26 ++++++++++++++++---------- net/mac80211/mesh_hwmp.c | 7 +++++-- net/mac80211/mesh_plink.c | 7 +++++-- net/mac80211/mlme.c | 45 ++++++++++++++++++++++++++++++++++----------- net/mac80211/parse.c | 30 +++++++++++++++++++++++++----- net/mac80211/scan.c | 6 +++++- net/mac80211/tdls.c | 12 +++++++++--- net/mac80211/tests/elems.c | 4 +++- 12 files changed, 123 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 48ce05e1d203..6d4bc80caf96 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -43,6 +43,7 @@ #define IEEE80211_FCTL_VERS 0x0003 #define IEEE80211_FCTL_FTYPE 0x000c #define IEEE80211_FCTL_STYPE 0x00f0 +#define IEEE80211_FCTL_TYPE (IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE) #define IEEE80211_FCTL_TODS 0x0100 #define IEEE80211_FCTL_FROMDS 0x0200 #define IEEE80211_FCTL_MOREFRAGS 0x0400 diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e38f46ffebfa..7da909d78c68 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ /** @@ -206,7 +206,10 @@ u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, if (elem_len <= 0) return 0; - elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL); + elems = ieee802_11_parse_elems(elem_data, elem_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems || elems->parse_error || !elems->addba_ext_ie) goto free; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6e36b09fe97f..168f84a1353b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2024 Intel Corporation + * Copyright(c) 2018-2025 Intel Corporation */ #include @@ -1554,6 +1554,7 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, { size_t baselen; struct ieee802_11_elems *elems; + u16 type; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); @@ -1566,8 +1567,9 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; + type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, false, NULL); + len - baselen, type, NULL); if (elems) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); @@ -1616,9 +1618,11 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, if (ies_len < 0) break; - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 
898ccbc4ec64..7f1ce9fc01c7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2422,7 +2422,8 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements - * @action: %true if the elements came from an action frame + * @type: type of the frame the elements came from + * (action, probe response, beacon, etc.) * @filter: bitmap of element IDs to filter out while calculating * the element CRC * @crc: CRC starting value @@ -2440,7 +2441,7 @@ struct ieee80211_elems_parse_params { enum ieee80211_conn_mode mode; const u8 *start; size_t len; - bool action; + u8 type; u64 filter; u32 crc; struct cfg80211_bss *bss; @@ -2452,17 +2453,14 @@ struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params); static inline struct ieee802_11_elems * -ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, - u64 filter, u32 crc, - struct cfg80211_bss *bss) +ieee802_11_parse_elems(const u8 *start, size_t len, u8 type, + struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, - .action = action, - .filter = filter, - .crc = crc, + .type = type, .bss = bss, .link_id = -1, }; @@ -2470,13 +2468,6 @@ ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return ieee802_11_parse_elems_full(¶ms); } -static inline struct ieee802_11_elems * -ieee802_11_parse_elems(const u8 *start, size_t len, bool action, - struct cfg80211_bss *bss) -{ - return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss); -} - extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index f37068a533f4..68901f1def0d 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. 
- * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -1410,7 +1410,10 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - elems = ieee802_11_parse_elems(pos, len - baselen, false, NULL); + elems = ieee802_11_parse_elems(pos, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ, + NULL); if (!elems) return; @@ -1455,11 +1458,11 @@ free: } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, - u16 stype, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { + u16 type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee802_11_elems *elems; @@ -1469,7 +1472,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, enum nl80211_band band = rx_status->band; /* ignore ProbeResp to foreign address */ - if (stype == IEEE80211_STYPE_PROBE_RESP && + if (type == (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP) && !ether_addr_equal(mgmt->da, sdata->vif.addr)) return; @@ -1478,8 +1481,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, return; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, - false, NULL); + len - baselen, type, NULL); if (!elems) return; @@ -1514,7 +1516,9 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, } if (ifmsh->sync_ops) - ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, len, + ifmsh->sync_ops->rx_bcn_presp(sdata, + type & IEEE80211_FCTL_STYPE, + mgmt, len, elems->mesh_config, rx_status); free: kfree(elems); @@ -1622,7 +1626,10 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.action.u.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); - elems = ieee802_11_parse_elems(pos, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(pos, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; @@ -1699,8 +1706,7 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, switch (stype) { case IEEE80211_STYPE_PROBE_RESP: case IEEE80211_STYPE_BEACON: - ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len, - rx_status); + ieee80211_mesh_rx_bcn_presp(sdata, mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_REQ: ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 9101858525dd..a41b57bd11ff 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. 
- * Copyright (C) 2019, 2021-2023 Intel Corporation + * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation * Author: Luis Carlos Cobo */ @@ -951,7 +951,10 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, - len - baselen, false, NULL); + len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index cb45a5d2009d..04c931cd2063 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation * Author: Luis Carlos Cobo */ #include @@ -1248,7 +1248,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; } - elems = ieee802_11_parse_elems(baseaddr, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(baseaddr, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems) { mesh_process_plink_frame(sdata, mgmt, elems, rx_status); kfree(elems); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2ee9eae89d05..804c3a95b7c6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -999,6 +999,9 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, .from_ap = true, .start = ies->data, .len = ies->len, + .type = ies->from_beacon ? + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; @@ -5177,7 +5180,9 @@ static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) continue; } - elems = ieee802_11_parse_elems(ies->data, ies->len, false, + elems = ieee802_11_parse_elems(ies->data, ies->len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, NULL); if (!elems) { rcu_read_unlock(); @@ -5223,6 +5228,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? 
-1 : link_id, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -6356,6 +6362,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .bss = NULL, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; struct ieee802_11_elems *elems; int ac; @@ -7264,7 +7271,9 @@ ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), - false, NULL); + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, + NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) @@ -7368,6 +7377,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; lockdep_assert_wiphy(local->hw.wiphy); @@ -7970,7 +7980,10 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, - ies_len, true, NULL); + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; @@ -8176,9 +8189,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, break; /* CSA IE cannot be overridden, no need for BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = @@ -8205,9 +8220,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.ext_chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; @@ -10985,7 +11002,10 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, pos = scratch + sizeof(control); len -= sizeof(control); - link_elems = ieee802_11_parse_elems(pos, len, false, NULL); + link_elems = ieee802_11_parse_elems(pos, len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!link_elems) continue; @@ -11036,7 +11056,10 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, u.action.u.epcs.variable) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; - elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); + elems = ieee802_11_parse_elems(pos, ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index c5e0f7f46004..bfc4ecb7a048 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * element parsing for mac80211 */ @@ -286,6 +286,24 @@ _ieee802_11_parse_elems_full(struct 
ieee80211_elems_parse_params *params, bitmap_zero(seen_elems, 256); + switch (params->type) { + /* we don't need to parse assoc request, luckily (it's value 0) */ + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ: + default: + WARN(1, "invalid frame type 0x%x for element parsing\n", + params->type); + break; + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION: + case IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON: + break; + } + for_each_element(elem, params->start, params->len) { const struct element *subelem; u8 elem_parse_failed; @@ -566,7 +584,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (!params->action) { + if (params->type != (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -582,7 +601,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, case WLAN_EID_CHANNEL_SWITCH_WRAPPER: if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (params->action) { + if (params->type == (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -942,7 +962,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, sub->len = end - sub->start; sub->mode = params->mode; - sub->action = params->action; + sub->type = params->type; sub->from_ap = params->from_ap; sub->link_id = -1; @@ -1041,7 +1061,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) sub.start = elems_parse->scratch_pos; sub.mode = params->mode; sub.len = nontx_len; - sub.action = params->action; + sub.type = params->type; sub.link_id = params->link_id; /* consume the space used for non-transmitted profile */ diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index bb9563f50e7b..5ef315ed3b0f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -76,7 +76,11 @@ void ieee80211_inform_bss(struct wiphy *wiphy, if (!update_data) return; - elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); + elems = ieee802_11_parse_elems(ies->data, ies->len, + update_data->beacon ? 
+ IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, + NULL); if (!elems) return; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index ba5fbacbeeda..dbbfe2d6842f 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation */ #include @@ -1783,7 +1783,10 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_resp.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ret = -ENOMEM; goto out; @@ -1902,7 +1905,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_req.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return -ENOMEM; diff --git a/net/mac80211/tests/elems.c b/net/mac80211/tests/elems.c index a53c55a879a8..1039794a0183 100644 --- a/net/mac80211/tests/elems.c +++ b/net/mac80211/tests/elems.c @@ -2,7 +2,7 @@ /* * KUnit tests for element parsing * - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation */ #include #include "../ieee80211_i.h" @@ -15,6 +15,8 @@ static void mle_defrag(struct kunit *test) .link_id = 12, .from_ap = true, .mode = IEEE80211_CONN_MODE_EHT, + /* type is not really relevant here */ + .type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON, }; struct ieee802_11_elems *parsed; struct sk_buff *skb; -- cgit v1.2.3 From aaa5abcc9d44d2c8484f779ab46d242d774cabcb Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Thu, 25 Sep 2025 18:42:31 +0800 Subject: coresight: tmc: add the handle of the event to the path The handle is essential for retrieving the AUX_EVENT of each CPU and is required in perf mode. It has been added to the coresight_path so that dependent devices can access it from the path when needed. 
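In outline (this simply restates the hunks below), etm_event_start() now stashes the per-CPU AUX handle on the path, and tmc_etr_get_buffer() picks it back up from there instead of treating its data argument as the handle itself:

	/* etm_event_start(): record the AUX handle on this CPU's path */
	path = etm_event_cpu_path(event_data, cpu);
	path->handle = handle;

	/* tmc_etr_get_buffer(): retrieve it via the path */
	struct coresight_path *path = data;
	struct perf_output_handle *handle = path->handle;
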
The existing bug can be reproduced with: perf record -e cs_etm//k -C 0-9 dd if=/dev/zero of=/dev/null Showing an oops as follows: Unable to handle kernel paging request at virtual address 000f6e84934ed19e Call trace: tmc_etr_get_buffer+0x30/0x80 [coresight_tmc] (P) catu_enable_hw+0xbc/0x3d0 [coresight_catu] catu_enable+0x70/0xe0 [coresight_catu] coresight_enable_path+0xb0/0x258 [coresight] Fixes: 080ee83cc361 ("Coresight: Change functions to accept the coresight_path") Signed-off-by: Carl Worth Reviewed-by: Leo Yan Co-developed-by: Jie Gan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-1-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-etm-perf.c | 1 + drivers/hwtracing/coresight/coresight-tmc-etr.c | 3 ++- include/linux/coresight.h | 10 ++++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index f677c08233ba..5c256af6e54a 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -520,6 +520,7 @@ static void etm_event_start(struct perf_event *event, int flags) goto out; path = etm_event_cpu_path(event_data, cpu); + path->handle = handle; /* We need a sink, no need to continue without one */ sink = coresight_get_sink(path); if (WARN_ON_ONCE(!sink)) diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 800be06598c1..60b0e0a6da05 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1334,7 +1334,8 @@ out: struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, enum cs_mode mode, void *data) { - struct perf_output_handle *handle = data; + struct coresight_path *path = data; + struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf; switch (mode) { diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 6de59ce8ef8c..2626105e3719 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -332,12 +332,14 @@ static struct coresight_dev_list (var) = { \ /** * struct coresight_path - data needed by enable/disable path - * @path_list: path from source to sink. - * @trace_id: trace_id of the whole path. + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + * @handle: handle of the aux_event. */ struct coresight_path { - struct list_head path_list; - u8 trace_id; + struct list_head path_list; + u8 trace_id; + struct perf_output_handle *handle; }; enum cs_mode { -- cgit v1.2.3 From 94baedb51dea4b0c97e3c9acd90953bec98d03e7 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:32 +0800 Subject: coresight: change helper_ops to accept coresight_path Update the helper_enable and helper_disable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. 
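For a helper driver this means ops along the lines of the following sketch (the my_helper_* names are made up for illustration); the typed path gives the callbacks direct access to fields such as path->handle and path->trace_id without casting:

	static int my_helper_enable(struct coresight_device *csdev,
				    enum cs_mode mode,
				    struct coresight_path *path)
	{
		/* path->handle, path->trace_id, ... are directly usable here */
		return 0;
	}

	static int my_helper_disable(struct coresight_device *csdev,
				     struct coresight_path *path)
	{
		return 0;
	}

	static const struct coresight_ops_helper my_helper_ops = {
		.enable  = my_helper_enable,
		.disable = my_helper_disable,
	};
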
Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-2-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-catu.c | 10 +++++----- drivers/hwtracing/coresight/coresight-core.c | 20 ++++++++++++-------- drivers/hwtracing/coresight/coresight-ctcu-core.c | 9 +++------ drivers/hwtracing/coresight/coresight-cti-core.c | 5 +++-- drivers/hwtracing/coresight/coresight-cti.h | 5 +++-- drivers/hwtracing/coresight/coresight-tmc-etr.c | 4 ++-- drivers/hwtracing/coresight/coresight-tmc.h | 3 ++- include/linux/coresight.h | 5 +++-- 8 files changed, 33 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index a3ccb7034ae1..69b36bae97ab 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -397,7 +397,7 @@ static int catu_wait_for_ready(struct catu_drvdata *drvdata) } static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, - void *data) + struct coresight_path *path) { int rc; u32 control, mode; @@ -425,7 +425,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, etrdev = coresight_find_input_type( csdev->pdata, CORESIGHT_DEV_TYPE_SINK, etr_subtype); if (etrdev) { - etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, data); + etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, path); if (IS_ERR(etr_buf)) return PTR_ERR(etr_buf); } @@ -455,7 +455,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, } static int catu_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { int rc = 0; struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev); @@ -463,7 +463,7 @@ static int catu_enable(struct coresight_device *csdev, enum cs_mode mode, guard(raw_spinlock_irqsave)(&catu_drvdata->spinlock); if (csdev->refcnt == 0) { CS_UNLOCK(catu_drvdata->base); - rc = catu_enable_hw(catu_drvdata, mode, data); + rc = catu_enable_hw(catu_drvdata, mode, path); CS_LOCK(catu_drvdata->base); } if (!rc) @@ -488,7 +488,7 @@ static int catu_disable_hw(struct catu_drvdata *drvdata) return rc; } -static int catu_disable(struct coresight_device *csdev, void *__unused) +static int catu_disable(struct coresight_device *csdev, struct coresight_path *path) { int rc = 0; struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev); diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 3267192f0c1c..f44ec9e5b692 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -355,17 +355,20 @@ static bool coresight_is_helper(struct coresight_device *csdev) } static int coresight_enable_helper(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - return helper_ops(csdev)->enable(csdev, mode, data); + return helper_ops(csdev)->enable(csdev, mode, path); } -static void coresight_disable_helper(struct coresight_device *csdev, void *data) +static void coresight_disable_helper(struct coresight_device *csdev, + struct coresight_path *path) { - helper_ops(csdev)->disable(csdev, data); + helper_ops(csdev)->disable(csdev, path); } -static void coresight_disable_helpers(struct coresight_device *csdev, void *data) +static void 
coresight_disable_helpers(struct coresight_device *csdev, + struct coresight_path *path) { int i; struct coresight_device *helper; @@ -373,7 +376,7 @@ static void coresight_disable_helpers(struct coresight_device *csdev, void *data for (i = 0; i < csdev->pdata->nr_outconns; ++i) { helper = csdev->pdata->out_conns[i]->dest_dev; if (helper && coresight_is_helper(helper)) - coresight_disable_helper(helper, data); + coresight_disable_helper(helper, path); } } @@ -479,7 +482,8 @@ void coresight_disable_path(struct coresight_path *path) EXPORT_SYMBOL_GPL(coresight_disable_path); static int coresight_enable_helpers(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { int i, ret = 0; struct coresight_device *helper; @@ -489,7 +493,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev, if (!helper || !coresight_is_helper(helper)) continue; - ret = coresight_enable_helper(helper, mode, data); + ret = coresight_enable_helper(helper, mode, path); if (ret) return ret; } diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index c586495e9a08..abed15eb72b4 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -156,17 +156,14 @@ static int ctcu_set_etr_traceid(struct coresight_device *csdev, struct coresight return __ctcu_set_etr_traceid(csdev, traceid, port_num, enable); } -static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode, void *data) +static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path) { - struct coresight_path *path = (struct coresight_path *)data; - return ctcu_set_etr_traceid(csdev, path, true); } -static int ctcu_disable(struct coresight_device *csdev, void *data) +static int ctcu_disable(struct coresight_device *csdev, struct coresight_path *path) { - struct coresight_path *path = (struct coresight_path *)data; - return ctcu_set_etr_traceid(csdev, path, false); } diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c index 8fb30dd73fd2..bfbc365bb2ef 100644 --- a/drivers/hwtracing/coresight/coresight-cti-core.c +++ b/drivers/hwtracing/coresight/coresight-cti-core.c @@ -799,14 +799,15 @@ static void cti_pm_release(struct cti_drvdata *drvdata) } /** cti ect operations **/ -int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data) +int cti_enable(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path) { struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev); return cti_enable_hw(drvdata); } -int cti_disable(struct coresight_device *csdev, void *data) +int cti_disable(struct coresight_device *csdev, struct coresight_path *path) { struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev); diff --git a/drivers/hwtracing/coresight/coresight-cti.h b/drivers/hwtracing/coresight/coresight-cti.h index 8362a47c939c..4f89091ee93f 100644 --- a/drivers/hwtracing/coresight/coresight-cti.h +++ b/drivers/hwtracing/coresight/coresight-cti.h @@ -216,8 +216,9 @@ int cti_add_connection_entry(struct device *dev, struct cti_drvdata *drvdata, const char *assoc_dev_name); struct cti_trig_con *cti_allocate_trig_con(struct device *dev, int in_sigs, int out_sigs); -int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data); -int cti_disable(struct coresight_device *csdev, void *data); +int cti_enable(struct 
coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path); +int cti_disable(struct coresight_device *csdev, struct coresight_path *path); void cti_write_all_hw_regs(struct cti_drvdata *drvdata); void cti_write_intack(struct device *dev, u32 ackval); void cti_write_single_reg(struct cti_drvdata *drvdata, int offset, u32 value); diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 60b0e0a6da05..51c6f73dd15c 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1332,9 +1332,9 @@ out: } struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - struct coresight_path *path = data; struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf; diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h index cbb4ba439158..95473d131032 100644 --- a/drivers/hwtracing/coresight/coresight-tmc.h +++ b/drivers/hwtracing/coresight/coresight-tmc.h @@ -442,7 +442,8 @@ struct coresight_device *tmc_etr_get_catu_device(struct tmc_drvdata *drvdata); void tmc_etr_set_catu_ops(const struct etr_buf_operations *catu); void tmc_etr_remove_catu_ops(void); struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, - enum cs_mode mode, void *data); + enum cs_mode mode, + struct coresight_path *path); extern const struct attribute_group coresight_etr_group; #endif diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2626105e3719..2bee2e3bb1c6 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -424,8 +424,9 @@ struct coresight_ops_source { */ struct coresight_ops_helper { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); - int (*disable)(struct coresight_device *csdev, void *data); + struct coresight_path *path); + int (*disable)(struct coresight_device *csdev, + struct coresight_path *path); }; -- cgit v1.2.3 From b139702a889692ec30702534ebb1ae2b11ed1cbf Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:33 +0800 Subject: coresight: change the sink_ops to accept coresight_path Update the sink_enable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. 
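With the new prototype, a sink driver picks the perf handle straight off the path instead of casting a void pointer. A minimal sketch, assuming hypothetical my_sink_enable_sysfs()/my_sink_enable_perf() callbacks (not part of this series):

static int my_sink_enable(struct coresight_device *csdev, enum cs_mode mode,
			  struct coresight_path *path)
{
	switch (mode) {
	case CS_MODE_SYSFS:
		return my_sink_enable_sysfs(csdev);
	case CS_MODE_PERF:
		/* The perf output handle now travels with the path. */
		return my_sink_enable_perf(csdev, path->handle);
	default:
		return -EINVAL;
	}
}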
Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-3-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-core.c | 10 +++++----- drivers/hwtracing/coresight/coresight-dummy.c | 2 +- drivers/hwtracing/coresight/coresight-etb10.c | 8 ++++---- drivers/hwtracing/coresight/coresight-etm-perf.c | 2 +- drivers/hwtracing/coresight/coresight-priv.h | 3 +-- drivers/hwtracing/coresight/coresight-sysfs.c | 2 +- drivers/hwtracing/coresight/coresight-tmc-etf.c | 10 ++++++---- drivers/hwtracing/coresight/coresight-tmc-etr.c | 10 ++++++---- drivers/hwtracing/coresight/coresight-tpiu.c | 2 +- drivers/hwtracing/coresight/coresight-trbe.c | 4 ++-- drivers/hwtracing/coresight/ultrasoc-smb.c | 9 +++++---- include/linux/coresight.h | 2 +- 12 files changed, 34 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index f44ec9e5b692..c660cf8adb1c 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -300,9 +300,10 @@ unlock: EXPORT_SYMBOL_GPL(coresight_add_helper); static int coresight_enable_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - return sink_ops(csdev)->enable(csdev, mode, data); + return sink_ops(csdev)->enable(csdev, mode, path); } static void coresight_disable_sink(struct coresight_device *csdev) @@ -501,8 +502,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev, return 0; } -int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, - void *sink_data) +int coresight_enable_path(struct coresight_path *path, enum cs_mode mode) { int ret = 0; u32 type; @@ -532,7 +532,7 @@ int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, switch (type) { case CORESIGHT_DEV_TYPE_SINK: - ret = coresight_enable_sink(csdev, mode, sink_data); + ret = coresight_enable_sink(csdev, mode, path); /* * Sink is the first component turned on. 
If we * failed to enable the sink, there are no components diff --git a/drivers/hwtracing/coresight/coresight-dummy.c b/drivers/hwtracing/coresight/coresight-dummy.c index aaa92b5081e3..14322c99e29d 100644 --- a/drivers/hwtracing/coresight/coresight-dummy.c +++ b/drivers/hwtracing/coresight/coresight-dummy.c @@ -52,7 +52,7 @@ static int dummy_source_trace_id(struct coresight_device *csdev, __maybe_unused } static int dummy_sink_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { dev_dbg(csdev->dev.parent, "Dummy sink enabled\n"); diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index 35db1b6093d1..6657602d8f2e 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -167,13 +167,13 @@ out: return ret; } -static int etb_enable_perf(struct coresight_device *csdev, void *data) +static int etb_enable_perf(struct coresight_device *csdev, struct coresight_path *path) { int ret = 0; pid_t pid; unsigned long flags; struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -224,7 +224,7 @@ out: } static int etb_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { int ret; @@ -233,7 +233,7 @@ static int etb_enable(struct coresight_device *csdev, enum cs_mode mode, ret = etb_enable_sysfs(csdev); break; case CS_MODE_PERF: - ret = etb_enable_perf(csdev, data); + ret = etb_enable_perf(csdev, path); break; default: ret = -EINVAL; diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 5c256af6e54a..17afa0f4cdee 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -527,7 +527,7 @@ static void etm_event_start(struct perf_event *event, int flags) goto fail_end_stop; /* Nothing will happen without a path */ - if (coresight_enable_path(path, CS_MODE_PERF, handle)) + if (coresight_enable_path(path, CS_MODE_PERF)) goto fail_end_stop; /* Finally enable the tracer */ diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 33e22b1ba043..fd896ac07942 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -135,8 +135,7 @@ static inline void CS_UNLOCK(void __iomem *addr) } void coresight_disable_path(struct coresight_path *path); -int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, - void *sink_data); +int coresight_enable_path(struct coresight_path *path, enum cs_mode mode); struct coresight_device *coresight_get_sink(struct coresight_path *path); struct coresight_device *coresight_get_sink_by_id(u32 id); struct coresight_device * diff --git a/drivers/hwtracing/coresight/coresight-sysfs.c b/drivers/hwtracing/coresight/coresight-sysfs.c index 5e52324aa9ac..d2a6ed8bcc74 100644 --- a/drivers/hwtracing/coresight/coresight-sysfs.c +++ b/drivers/hwtracing/coresight/coresight-sysfs.c @@ -215,7 +215,7 @@ int coresight_enable_sysfs(struct coresight_device *csdev) if (!IS_VALID_CS_TRACE_ID(path->trace_id)) goto err_path; - ret = coresight_enable_path(path, CS_MODE_SYSFS, NULL); + ret = coresight_enable_path(path, CS_MODE_SYSFS); if (ret) goto 
err_path; diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c index 0f45ab5e5249..8882b1c4cdc0 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etf.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c @@ -246,13 +246,14 @@ out: return ret; } -static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data) +static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, + struct coresight_path *path) { int ret = 0; pid_t pid; unsigned long flags; struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -304,7 +305,8 @@ static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data) } static int tmc_enable_etf_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { int ret; @@ -313,7 +315,7 @@ static int tmc_enable_etf_sink(struct coresight_device *csdev, ret = tmc_enable_etf_sink_sysfs(csdev); break; case CS_MODE_PERF: - ret = tmc_enable_etf_sink_perf(csdev, data); + ret = tmc_enable_etf_sink_perf(csdev, path); break; /* We shouldn't be here */ default: diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 51c6f73dd15c..e0d83ee01b77 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1733,13 +1733,14 @@ out: return size; } -static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data) +static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, + struct coresight_path *path) { int rc = 0; pid_t pid; unsigned long flags; struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -1787,13 +1788,14 @@ unlock_out: } static int tmc_enable_etr_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { switch (mode) { case CS_MODE_SYSFS: return tmc_enable_etr_sink_sysfs(csdev); case CS_MODE_PERF: - return tmc_enable_etr_sink_perf(csdev, data); + return tmc_enable_etr_sink_perf(csdev, path); default: return -EINVAL; } diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 9463afdbda8a..aaa44bc521c3 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -75,7 +75,7 @@ static void tpiu_enable_hw(struct csdev_access *csa) } static int tpiu_enable(struct coresight_device *csdev, enum cs_mode mode, - void *__unused) + struct coresight_path *path) { struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 43643d2c5bdd..293715b4ff0e 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -1013,11 +1013,11 @@ err: } static int arm_trbe_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); struct 
trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct trbe_buf *buf = etm_perf_sink_config(handle); WARN_ON(cpudata->cpu != smp_processor_id()); diff --git a/drivers/hwtracing/coresight/ultrasoc-smb.c b/drivers/hwtracing/coresight/ultrasoc-smb.c index 26cfc939e5bd..8f7922a5e534 100644 --- a/drivers/hwtracing/coresight/ultrasoc-smb.c +++ b/drivers/hwtracing/coresight/ultrasoc-smb.c @@ -213,10 +213,11 @@ static void smb_enable_sysfs(struct coresight_device *csdev) coresight_set_mode(csdev, CS_MODE_SYSFS); } -static int smb_enable_perf(struct coresight_device *csdev, void *data) +static int smb_enable_perf(struct coresight_device *csdev, + struct coresight_path *path) { struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); pid_t pid; @@ -240,7 +241,7 @@ static int smb_enable_perf(struct coresight_device *csdev, void *data) } static int smb_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent); int ret = 0; @@ -261,7 +262,7 @@ static int smb_enable(struct coresight_device *csdev, enum cs_mode mode, smb_enable_sysfs(csdev); break; case CS_MODE_PERF: - ret = smb_enable_perf(csdev, data); + ret = smb_enable_perf(csdev, path); break; default: ret = -EINVAL; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2bee2e3bb1c6..56d0108658db 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -367,7 +367,7 @@ enum cs_mode { */ struct coresight_ops_sink { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); + struct coresight_path *path); int (*disable)(struct coresight_device *csdev); void *(*alloc_buffer)(struct coresight_device *csdev, struct perf_event *event, void **pages, -- cgit v1.2.3 From 22ea7b9d96e26147b7a3ea1be7aa106cc700907c Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:11 +0100 Subject: platform/x86: asus-wmi: export symbols used for read/write WMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export symbols for reading/writing WMI symbols using a namespace. Existing functions: - asus_wmi_evaluate_method - asus_wmi_set_devstate New function: - asus_wmi_get_devstate_dsts The new function is intended for use with DSTS WMI method only and avoids requiring the asus_wmi driver data to select the WMI method. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20251102215319.3126879-2-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 46 ++++++++++++++++++++++++++++-- include/linux/platform_data/x86/asus-wmi.h | 5 ++++ 2 files changed, 48 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index e72a2b5d158e..c3e90517ce0f 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -390,7 +390,7 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { return asus_wmi_evaluate_method3(method_id, arg0, arg1, 0, retval); } -EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method); +EXPORT_SYMBOL_NS_GPL(asus_wmi_evaluate_method, "ASUS_WMI"); static int asus_wmi_evaluate_method5(u32 method_id, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 *retval) @@ -554,12 +554,52 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) return 0; } -int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, - u32 *retval) +/** + * asus_wmi_get_devstate_dsts() - Get the WMI function state. + * @dev_id: The WMI method ID to call. + * @retval: A pointer to where to store the value returned from WMI. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) +{ + int err; + + err = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, retval); + if (err) + return err; + + if ((*retval & ASUS_WMI_DSTS_PRESENCE_BIT) == 0x00) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(asus_wmi_get_devstate_dsts, "ASUS_WMI"); + +/** + * asus_wmi_set_devstate() - Set the WMI function state. + * + * Note: an asus_wmi_set_devstate() call must be paired with a + * asus_wmi_get_devstate_dsts() to check if the WMI function is supported. + * + * @dev_id: The WMI function to call. + * @ctrl_param: The argument to be used for this WMI function. + * @retval: A pointer to where to store the value returned from WMI. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. 
+ */ +int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) { return asus_wmi_evaluate_method(ASUS_WMI_METHODID_DEVS, dev_id, ctrl_param, retval); } +EXPORT_SYMBOL_NS_GPL(asus_wmi_set_devstate, "ASUS_WMI"); /* Helper for special devices with magic return codes */ static int asus_wmi_get_devstate_bits(struct asus_wmi *asus, diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8a515179113d..dbd44d9fbb6f 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -166,6 +166,7 @@ enum asus_ally_mcu_hack { #if IS_REACHABLE(CONFIG_ASUS_WMI) void set_ally_mcu_hack(enum asus_ally_mcu_hack status); void set_ally_mcu_powersave(bool enabled); +int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval); int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); #else @@ -179,6 +180,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) { return -ENODEV; } +static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) +{ + return -ENODEV; +} static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { -- cgit v1.2.3 From 8989d328dfe7c7a3b9f4b9f0ef60006d277f81cc Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:38 +0100 Subject: net: Helper to move packet data and metadata after skb_push/pull Lay groundwork for fixing BPF helpers available to TC(X) programs. When skb_push() or skb_pull() is called in a TC(X) ingress BPF program, the skb metadata must be kept in front of the MAC header. Otherwise, BPF programs using the __sk_buff->data_meta pseudo-pointer lose access to it. Introduce a helper that moves both metadata and a specified number of packet data bytes together, suitable as a drop-in replacement for memmove(). Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-1-5ceb08a9b37b@cloudflare.com --- include/linux/skbuff.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a7cc3d1f4fd1..ff90281ddf90 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4564,6 +4564,81 @@ static inline void skb_metadata_clear(struct sk_buff *skb) skb_metadata_set(skb, 0); } +/** + * skb_data_move - Move packet data and metadata after skb_push() or skb_pull(). + * @skb: packet to operate on + * @len: number of bytes pushed or pulled from &sk_buff->data + * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data + * + * Moves @n bytes of packet data, can be zero, and all bytes of skb metadata. + * + * Assumes metadata is located immediately before &sk_buff->data prior to the + * push/pull, and that sufficient headroom exists to hold it after an + * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued. + * + * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this + * helper directly. 
+ */ +static inline void skb_data_move(struct sk_buff *skb, const int len, + const unsigned int n) +{ + const u8 meta_len = skb_metadata_len(skb); + u8 *meta, *meta_end; + + if (!len || (!n && !meta_len)) + return; + + if (!meta_len) + goto no_metadata; + + meta_end = skb_metadata_end(skb); + meta = meta_end - meta_len; + + if (WARN_ON_ONCE(meta_end + len != skb->data || + meta_len > skb_headroom(skb))) { + skb_metadata_clear(skb); + goto no_metadata; + } + + memmove(meta + len, meta, meta_len + n); + return; + +no_metadata: + memmove(skb->data, skb->data - len, n); +} + +/** + * skb_postpull_data_move - Move packet data and metadata after skb_pull(). + * @skb: packet to operate on + * @len: number of bytes pulled from &sk_buff->data + * @n: number of bytes to memmove() from pre-pull &sk_buff->data + * + * See skb_data_move() for details. + */ +static inline void skb_postpull_data_move(struct sk_buff *skb, + const unsigned int len, + const unsigned int n) +{ + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_data_move(skb, len, n); +} + +/** + * skb_postpush_data_move - Move packet data and metadata after skb_push(). + * @skb: packet to operate on + * @len: number of bytes pushed onto &sk_buff->data + * @n: number of bytes to memmove() from pre-push &sk_buff->data + * + * See skb_data_move() for details. + */ +static inline void skb_postpush_data_move(struct sk_buff *skb, + const unsigned int len, + const unsigned int n) +{ + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_data_move(skb, -len, n); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING -- cgit v1.2.3 From f38499ff45f567c932d0911e6a30b8ca022b9b52 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:40 +0100 Subject: bpf: Unclone skb head on bpf_dynptr_write to skb metadata Currently bpf_dynptr_from_skb_meta() marks the dynptr as read-only when the skb is cloned, preventing writes to metadata. Remove this restriction and unclone the skb head on bpf_dynptr_write() to metadata, now that the metadata is preserved during uncloning. This makes metadata dynptr consistent with skb dynptr, allowing writes regardless of whether the skb is cloned. 
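On the BPF program side, a write through the metadata dynptr now succeeds on cloned skbs as well. A sketch, assuming the usual __ksym kfunc declaration style and a "tc" section name (error handling trimmed):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
				    struct bpf_dynptr *ptr) __ksym;

SEC("tc")
int write_skb_meta(struct __sk_buff *skb)
{
	struct bpf_dynptr meta;
	__u32 tag = 0xcafe;

	if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
		return TC_ACT_OK;

	/* Used to fail on a cloned skb (read-only dynptr); now unclones the head. */
	bpf_dynptr_write(&meta, 0, &tag, sizeof(tag), 0);
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";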
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-3-5ceb08a9b37b@cloudflare.com --- include/linux/filter.h | 9 +++++++++ kernel/bpf/helpers.c | 6 ++---- net/core/filter.c | 18 ++++++++++++------ 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index e116de7edc58..a104b3994230 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1781,6 +1781,8 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags); void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, @@ -1817,6 +1819,13 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi { } +static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, + u64 flags) +{ + return -EOPNOTSUPP; +} + static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { return ERR_PTR(-EOPNOTSUPP); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..3e830fd31f5f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1842,10 +1842,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - if (flags) - return -EINVAL; - memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); - return 0; + return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, + len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 52721efba332..673299fd3d58 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12102,6 +12102,18 @@ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; } +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (unlikely(bpf_try_make_writable(skb, 0))) + return -EFAULT; + + memmove(bpf_skb_meta_pointer(skb, offset), from, len); + return 0; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12129,9 +12141,6 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to * &__sk_buff->data_meta. * - * If passed @skb_ is a clone which shares the data with the original, the - * dynptr will be read-only. This limitation may be lifted in the future. 
- * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12149,9 +12158,6 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); - if (skb_cloned(skb)) - bpf_dynptr_set_rdonly(ptr); - return 0; } -- cgit v1.2.3 From b85be58e2f7cff47f7477ae61022644a198ee592 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:41 +0100 Subject: vlan: Make vlan_remove_tag return nothing All callers ignore the return value. Prepare to reorder memmove() after skb_pull() which is a common pattern. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-4-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 15e01935d3fa..afa5cc61a0fa 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -731,10 +731,8 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. - * - * Returns: a new pointer to skb->data, or NULL on failure to pull. */ -static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) +static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); @@ -742,7 +740,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); - return __skb_pull(skb, VLAN_HLEN); + __skb_pull(skb, VLAN_HLEN); } /** -- cgit v1.2.3 From efd35c26239bed39193201e958d65e695231ccda Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:42 +0100 Subject: bpf: Make bpf_skb_vlan_pop helper metadata-safe Use the metadata-aware helper to move packet bytes after skb_pull(), ensuring metadata remains valid after calling the BPF helper. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-5-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index afa5cc61a0fa..4ecc2509b0d4 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -738,9 +738,9 @@ static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) *vlan_tci = ntohs(vhdr->h_vlan_TCI); - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); __skb_pull(skb, VLAN_HLEN); + skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN); } /** -- cgit v1.2.3 From 55ffc98b44d28e13a218306666d16f2c7236d0ae Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:43 +0100 Subject: bpf: Make bpf_skb_vlan_push helper metadata-safe Use the metadata-aware helper to move packet bytes after skb_push(), ensuring metadata remains valid after calling the BPF helper. Also, take care to reserve sufficient headroom for metadata to fit. 
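The same pattern generalizes beyond VLAN tags. A sketch, assuming a hypothetical 4-byte tag (MY_TAG_LEN) pushed in front of a mac_len-byte MAC header:

#include <linux/skbuff.h>

#define MY_TAG_LEN	4

static int my_push_tag(struct sk_buff *skb, unsigned int mac_len)
{
	/* Reserve headroom for the tag plus the metadata living before the MAC header. */
	if (skb_cow_head(skb, skb_metadata_len(skb) + MY_TAG_LEN) < 0)
		return -ENOMEM;

	skb_push(skb, MY_TAG_LEN);
	/* Move mac_len bytes of packet data and all skb metadata together. */
	skb_postpush_data_move(skb, MY_TAG_LEN, mac_len);
	return 0;
}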
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-6-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 4ecc2509b0d4..f7f34eb15e06 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -355,16 +355,17 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { + const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0; struct vlan_ethhdr *veth; - if (skb_cow_head(skb, VLAN_HLEN) < 0) + if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) - memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); + skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; -- cgit v1.2.3 From 7ff14c52049eafecdd72cd0a12cae6905876566a Mon Sep 17 00:00:00 2001 From: Simon Schippers Date: Thu, 6 Nov 2025 18:56:15 +0100 Subject: usbnet: Add support for Byte Queue Limits (BQL) In the current implementation, usbnet uses a fixed tx_qlen of: USB2: 60 * 1518 bytes = 91.08 KB USB3: 60 * 5 * 1518 bytes = 454.80 KB Such large transmit queues can be problematic, especially for cellular modems. For example, with a typical cellular link speed of 10 Mbit/s, a fully occupied USB3 transmit queue results in: 454.80 KB / (10 Mbit/s / 8 bit/byte) = 363.84 ms of additional latency. This patch adds support for Byte Queue Limits (BQL) [1] to dynamically manage the transmit queue size and reduce latency without sacrificing throughput. Testing was performed on various devices using the usbnet driver for packet transmission: - DELOCK 66045: USB3 to 2.5 GbE adapter (ax88179_178a) - DELOCK 61969: USB2 to 1 GbE adapter (asix) - Quectel RM520: 5G modem (qmi_wwan) - USB2 Android tethering (cdc_ncm) No performance degradation was observed for iperf3 TCP or UDP traffic, while latency for a prioritized ping application was significantly reduced. For example, using the USB3 to 2.5 GbE adapter, which was fully utilized by iperf3 UDP traffic, the prioritized ping was improved from 1.6 ms to 0.6 ms. With the same setup but with a 100 Mbit/s Ethernet connection, the prioritized ping was improved from 35 ms to 5 ms.
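For reference, the generic BQL accounting pattern that this patch wires into usbnet looks roughly like the sketch below; my_hw_queue_skb() and the completion path are assumptions, not usbnet code:

#include <linux/netdevice.h>

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *net)
{
	unsigned int len = skb->len;

	if (my_hw_queue_skb(skb))		/* hand the skb to the hardware queue */
		return NETDEV_TX_BUSY;

	netdev_sent_queue(net, len);		/* BQL: account bytes now in flight */
	return NETDEV_TX_OK;
}

static void my_tx_complete(struct net_device *net, unsigned int pkts,
			   unsigned int bytes)
{
	/* BQL: report completed work so the dynamic queue limit can adapt */
	netdev_completed_queue(net, pkts, bytes);
}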
[1] https://lwn.net/Articles/469652/ Signed-off-by: Simon Schippers Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106175615.26948-1-simon.schippers@tu-dortmund.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 11 +++++++++++ include/linux/usb/usbnet.h | 2 ++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index f3087fb62f4f..3d10cf791c51 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -831,6 +831,7 @@ int usbnet_stop(struct net_device *net) clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); + netdev_reset_queue(net); netif_info(dev, ifdown, dev->net, "stop stats: rx/tx %lu/%lu, errs %lu/%lu\n", @@ -939,6 +940,7 @@ int usbnet_open(struct net_device *net) } set_bit(EVENT_DEV_OPEN, &dev->flags); + netdev_reset_queue(net); netif_start_queue (net); netif_info(dev, ifup, dev->net, "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n", @@ -1500,6 +1502,7 @@ netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net) case 0: netif_trans_update(net); __usbnet_queue_skb(&dev->txq, skb, tx_start); + netdev_sent_queue(net, skb->len); if (dev->txq.qlen >= TX_QLEN (dev)) netif_stop_queue (net); } @@ -1563,6 +1566,7 @@ static inline void usb_free_skb(struct sk_buff *skb) static void usbnet_bh(struct timer_list *t) { struct usbnet *dev = timer_container_of(dev, t, delay); + unsigned int bytes_compl = 0, pkts_compl = 0; struct sk_buff *skb; struct skb_data *entry; @@ -1574,6 +1578,8 @@ static void usbnet_bh(struct timer_list *t) usb_free_skb(skb); continue; case tx_done: + bytes_compl += skb->len; + pkts_compl++; kfree(entry->urb->sg); fallthrough; case rx_cleanup: @@ -1584,6 +1590,10 @@ static void usbnet_bh(struct timer_list *t) } } + spin_lock_bh(&dev->bql_spinlock); + netdev_completed_queue(dev->net, pkts_compl, bytes_compl); + spin_unlock_bh(&dev->bql_spinlock); + /* restart RX again after disabling due to high error rate */ clear_bit(EVENT_RX_KILL, &dev->flags); @@ -1755,6 +1765,7 @@ usbnet_probe(struct usb_interface *udev, const struct usb_device_id *prod) skb_queue_head_init (&dev->txq); skb_queue_head_init (&dev->done); skb_queue_head_init(&dev->rxq_pause); + spin_lock_init(&dev->bql_spinlock); INIT_WORK(&dev->bh_work, usbnet_bh_work); INIT_WORK (&dev->kevent, usbnet_deferred_kevent); init_usb_anchor(&dev->deferred); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index a2d54122823d..2945923a8a95 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -14,6 +14,7 @@ #include #include #include +#include /* interface from usbnet core to each USB networking link we handle */ struct usbnet { @@ -59,6 +60,7 @@ struct usbnet { struct mutex interrupt_mutex; struct usb_anchor deferred; struct work_struct bh_work; + spinlock_t bql_spinlock; struct work_struct kevent; unsigned long flags; -- cgit v1.2.3 From 2b9a0f21fbb8a3b7df7faa5b7534897a86c44b98 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:13 +0100 Subject: ns: move namespace types into separate header Add a dedicated header for namespace types. 
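For orientation, the _Generic-based helpers that move into this header resolve the embedded ns_common from a typed namespace pointer at compile time. A small sketch (show_ns_info() is hypothetical):

#include <linux/ns_common.h>
#include <linux/pid_namespace.h>
#include <linux/printk.h>

static void show_ns_info(struct pid_namespace *pid_ns)
{
	/* Picks &pid_ns->ns based on the pointer type. */
	struct ns_common *ns = to_ns_common(pid_ns);

	pr_info("ns type %u inum %u\n", ns->ns_type, ns->inum);
}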
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-1-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 205 +++++++++++++++++++++++++++++++++++++ include/linux/ns_common.h | 196 +---------------------------------- 2 files changed, 206 insertions(+), 195 deletions(-) create mode 100644 include/linux/ns/ns_common_types.h (limited to 'include/linux') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h new file mode 100644 index 000000000000..ccd1d1e116f6 --- /dev/null +++ b/include/linux/ns/ns_common_types.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NS_COMMON_TYPES_H +#define _LINUX_NS_COMMON_TYPES_H + +#include +#include +#include +#include + +struct cgroup_namespace; +struct dentry; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct proc_ns_operations; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations utsns_operations; + +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc//ns/, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. 
But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + */ +struct ns_common { + u32 ns_type; + struct dentry *stashed; + const struct proc_ns_operations *ops; + unsigned int inum; + refcount_t __ns_ref; /* do not use directly */ + union { + struct { + u64 ns_id; + struct /* global namespace rbtree and list */ { + struct rb_node ns_unified_tree_node; + struct list_head ns_unified_list_node; + }; + struct /* per type rbtree and list */ { + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct /* namespace ownership rbtree and list */ { + struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ + struct list_head ns_owner; /* list of namespaces owned by this namespace */ + struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ + struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ + }; + atomic_t __ns_ref_active; /* do not use directly */ + }; + struct rcu_head ns_rcu; + }; +}; + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: 
&init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#endif /* _LINUX_NS_COMMON_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 66ea09b48377..6a4ca8c3b9c4 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -2,133 +2,12 @@ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H +#include #include -#include #include #include #include -struct proc_ns_operations; - -struct cgroup_namespace; -struct ipc_namespace; -struct mnt_namespace; -struct net; -struct pid_namespace; -struct time_namespace; -struct user_namespace; -struct uts_namespace; - -extern struct cgroup_namespace init_cgroup_ns; -extern struct ipc_namespace init_ipc_ns; -extern struct mnt_namespace init_mnt_ns; -extern struct net init_net; -extern struct pid_namespace init_pid_ns; -extern struct time_namespace init_time_ns; -extern struct user_namespace init_user_ns; -extern struct uts_namespace init_uts_ns; - -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; -extern const struct proc_ns_operations pidns_operations; -extern const struct proc_ns_operations pidns_for_children_operations; -extern const struct proc_ns_operations userns_operations; -extern const struct proc_ns_operations mntns_operations; -extern const struct proc_ns_operations cgroupns_operations; -extern const struct proc_ns_operations timens_operations; -extern const struct proc_ns_operations timens_for_children_operations; - -/* - * Namespace lifetimes are managed via a two-tier reference counting model: - * - * (1) __ns_ref (refcount_t): Main reference count tracking memory - * lifetime. Controls when the namespace structure itself is freed. - * It also pins the namespace on the namespace trees whereas (2) - * only regulates their visibility to userspace. - * - * (2) __ns_ref_active (atomic_t): Reference count tracking active users. 
- * Controls visibility of the namespace in the namespace trees. - * Any live task that uses the namespace (via nsproxy or cred) holds - * an active reference. Any open file descriptor or bind-mount of - * the namespace holds an active reference. Once all tasks have - * called exited their namespaces and all file descriptors and - * bind-mounts have been released the active reference count drops - * to zero and the namespace becomes inactive. IOW, the namespace - * cannot be listed or opened via file handles anymore. - * - * Note that it is valid to transition from active to inactive and - * back from inactive to active e.g., when resurrecting an inactive - * namespace tree via the SIOCGSKNS ioctl(). - * - * Relationship and lifecycle states: - * - * - Active (__ns_ref_active > 0): - * Namespace is actively used and visible to userspace. The namespace - * can be reopened via /proc//ns/, via namespace file - * handles, or discovered via listns(). - * - * - Inactive (__ns_ref_active == 0, __ns_ref > 0): - * No tasks are actively using the namespace and it isn't pinned by - * any bind-mounts or open file descriptors anymore. But the namespace - * is still kept alive by internal references. For example, the user - * namespace could be pinned by an open file through file->f_cred - * references when one of the now defunct tasks had opened a file and - * handed the file descriptor off to another process via a UNIX - * sockets. Such references keep the namespace structure alive through - * __ns_ref but will not hold an active reference. - * - * - Destroyed (__ns_ref == 0): - * No references remain. The namespace is removed from the tree and freed. - * - * State transitions: - * - * Active -> Inactive: - * When the last task using the namespace exits it drops its active - * references to all namespaces. However, user and pid namespaces - * remain accessible until the task has been reaped. - * - * Inactive -> Active: - * An inactive namespace tree might be resurrected due to e.g., the - * SIOCGSKNS ioctl() on a socket. - * - * Inactive -> Destroyed: - * When __ns_ref drops to zero the namespace is removed from the - * namespaces trees and the memory is freed (after RCU grace period). - * - * Initial namespaces: - * Boot-time namespaces (init_net, init_pid_ns, etc.) start with - * __ns_ref_active = 1 and remain active forever. 
- */ -struct ns_common { - u32 ns_type; - struct dentry *stashed; - const struct proc_ns_operations *ops; - unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ - union { - struct { - u64 ns_id; - struct /* global namespace rbtree and list */ { - struct rb_node ns_unified_tree_node; - struct list_head ns_unified_list_node; - }; - struct /* per type rbtree and list */ { - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct /* namespace ownership rbtree and list */ { - struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ - struct list_head ns_owner; /* list of namespaces owned by this namespace */ - struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ - struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ - }; - atomic_t __ns_ref_active; /* do not use directly */ - }; - struct rcu_head ns_rcu; - }; -}; - bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); @@ -147,79 +26,6 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - const struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - const struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - const struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - const struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - const struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - const struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - const struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns, \ - const struct uts_namespace *: &(__ns)->ns) - -#define ns_init_inum(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ - struct ipc_namespace *: IPC_NS_INIT_INO, \ - struct mnt_namespace *: MNT_NS_INIT_INO, \ - struct net *: NET_NS_INIT_INO, \ - struct pid_namespace *: PID_NS_INIT_INO, \ - struct time_namespace *: TIME_NS_INIT_INO, \ - struct user_namespace *: USER_NS_INIT_INO, \ - struct uts_namespace *: UTS_NS_INIT_INO) - -#define ns_init_ns(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &init_cgroup_ns, \ - struct ipc_namespace *: &init_ipc_ns, \ - struct mnt_namespace *: &init_mnt_ns, \ - struct net *: &init_net, \ - struct pid_namespace *: &init_pid_ns, \ - struct time_namespace *: &init_time_ns, \ - struct user_namespace *: &init_user_ns, \ - struct uts_namespace *: &init_uts_ns) - -#define ns_init_id(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ - struct ipc_namespace *: IPC_NS_INIT_ID, \ - struct mnt_namespace *: MNT_NS_INIT_ID, \ - struct net *: NET_NS_INIT_ID, \ - struct pid_namespace *: PID_NS_INIT_ID, \ - struct time_namespace *: TIME_NS_INIT_ID, \ - struct user_namespace *: USER_NS_INIT_ID, \ - struct uts_namespace *: UTS_NS_INIT_ID) - -#define to_ns_operations(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ - struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ - struct mnt_namespace *: &mntns_operations, \ - struct net *: (IS_ENABLED(CONFIG_NET_NS) ? 
&netns_operations : NULL), \ - struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ - struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ - struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ - struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) - -#define ns_common_type(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CLONE_NEWCGROUP, \ - struct ipc_namespace *: CLONE_NEWIPC, \ - struct mnt_namespace *: CLONE_NEWNS, \ - struct net *: CLONE_NEWNET, \ - struct pid_namespace *: CLONE_NEWPID, \ - struct time_namespace *: CLONE_NEWTIME, \ - struct user_namespace *: CLONE_NEWUSER, \ - struct uts_namespace *: CLONE_NEWUTS) #define NS_COMMON_INIT(nsname, refs) \ { \ -- cgit v1.2.3 From ea1549e628ec51dcbea1d158301993364b754d75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:14 +0100 Subject: nstree: decouple from ns_common header Foward declare struct ns_common and remove the include of ns_common.h. We want ns_common.h to possibly include nstree structures but not the other way around. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-2-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 38674c6fa4f7..25040a98a92b 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -3,7 +3,6 @@ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H -#include #include #include #include @@ -11,6 +10,8 @@ #include #include +struct ns_common; + extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; -- cgit v1.2.3 From 1c64fb02ac46f5ca93ac9f5470f124921b4713b7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:15 +0100 Subject: nstree: move nstree types into separate header Introduce two new fundamental data structures for namespace tree management in a separate header file. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-3-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns/nstree_types.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/nstree.h | 1 + 2 files changed, 37 insertions(+) create mode 100644 include/linux/ns/nstree_types.h (limited to 'include/linux') diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h new file mode 100644 index 000000000000..6ee0c39686f8 --- /dev/null +++ b/include/linux/ns/nstree_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ +#ifndef _LINUX_NSTREE_TYPES_H +#define _LINUX_NSTREE_TYPES_H + +#include +#include + +/** + * struct ns_tree_root - Root of a namespace tree + * @ns_rb: Red-black tree root for efficient lookups + * @ns_list_head: List head for sequential iteration + * + * Each namespace tree maintains both an rbtree (for O(log n) lookups) + * and a list (for efficient sequential iteration). The list is kept in + * the same sorted order as the rbtree. + */ +struct ns_tree_root { + struct rb_root ns_rb; + struct list_head ns_list_head; +}; + +/** + * struct ns_tree_node - Node in a namespace tree + * @ns_node: Red-black tree node + * @ns_list_entry: List entry for sequential iteration + * + * Represents a namespace's position in a tree. 
Each namespace has + * multiple tree nodes for different trees (unified, per-type, owner). + */ +struct ns_tree_node { + struct rb_node ns_node; + struct list_head ns_list_entry; +}; + +#endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 25040a98a92b..0e275df7e99a 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -3,6 +3,7 @@ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H +#include #include #include #include -- cgit v1.2.3 From d12ea8062fd31f02beeeb76a7884ab9bc4f5b197 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:16 +0100 Subject: nstree: add helper to operate on struct ns_tree_{node,root} Add helpers that work on the combined rbtree and rculist combined. This will make the code a lot more managable and legible. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-4-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 8 +++++ kernel/nstree.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 0e275df7e99a..98b848cf2f1c 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -22,6 +22,14 @@ extern struct ns_tree time_ns_tree; extern struct ns_tree user_ns_tree; extern struct ns_tree uts_ns_tree; +void ns_tree_node_init(struct ns_tree_node *node); +void ns_tree_root_init(struct ns_tree_root *root); +bool ns_tree_node_empty(const struct ns_tree_node *node); +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)); +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); + #define to_ns_tree(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(cgroup_ns_tree), \ diff --git a/kernel/nstree.c b/kernel/nstree.c index 97404fb90749..fe71ff943f70 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -73,6 +73,91 @@ struct ns_tree time_ns_tree = { .type = CLONE_NEWTIME, }; +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. 
+ * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. + */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} + static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) -- cgit v1.2.3 From a657bc8a75cf40c3d0814fe6488ba4af56528f42 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:17 +0100 Subject: nstree: switch to new structures Switch the nstree management to the new combined structures. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-5-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns/ns_common_types.h | 27 ++--- include/linux/ns/nstree_types.h | 19 ++++ include/linux/ns_common.h | 27 +++-- include/linux/nstree.h | 26 ++--- kernel/nscommon.c | 13 +-- kernel/nstree.c | 199 ++++++++++++++----------------------- 7 files changed, 136 insertions(+), 177 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index eded33eeb647..ad19530a13b2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -138,7 +138,7 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) if (!node) return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node); + ns = rb_entry(node, struct ns_common, ns_tree_node.ns_node); return container_of(ns, struct mnt_namespace, ns); } diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index ccd1d1e116f6..b332b019b29c 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_TYPES_H #include +#include #include #include #include @@ -98,6 +99,13 @@ extern const struct proc_ns_operations utsns_operations; * Initial namespaces: * Boot-time namespaces (init_net, init_pid_ns, etc.) start with * __ns_ref_active = 1 and remain active forever. 
+ * + * @ns_type: type of namespace (e.g., CLONE_NEWNET) + * @stashed: cached dentry to be used by the vfs + * @ops: namespace operations + * @inum: namespace inode number (quickly recycled for non-initial namespaces) + * @__ns_ref: main reference count (do not use directly) + * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { u32 ns_type; @@ -106,24 +114,7 @@ struct ns_common { unsigned int inum; refcount_t __ns_ref; /* do not use directly */ union { - struct { - u64 ns_id; - struct /* global namespace rbtree and list */ { - struct rb_node ns_unified_tree_node; - struct list_head ns_unified_list_node; - }; - struct /* per type rbtree and list */ { - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct /* namespace ownership rbtree and list */ { - struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ - struct list_head ns_owner; /* list of namespaces owned by this namespace */ - struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ - struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ - }; - atomic_t __ns_ref_active; /* do not use directly */ - }; + struct ns_tree; struct rcu_head ns_rcu; }; }; diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h index 6ee0c39686f8..2fb28ee31efb 100644 --- a/include/linux/ns/nstree_types.h +++ b/include/linux/ns/nstree_types.h @@ -33,4 +33,23 @@ struct ns_tree_node { struct list_head ns_list_entry; }; +/** + * struct ns_tree - Namespace tree nodes and active reference count + * @ns_id: Unique namespace identifier + * @__ns_ref_active: Active reference count (do not use directly) + * @ns_unified_node: Node in the global namespace tree + * @ns_tree_node: Node in the per-type namespace tree + * @ns_owner_node: Node in the owner namespace's tree of owned namespaces + * @ns_owner_root: Root of the tree of namespaces owned by this namespace + * (only used when this namespace is an owner) + */ +struct ns_tree { + u64 ns_id; + atomic_t __ns_ref_active; + struct ns_tree_node ns_unified_node; + struct ns_tree_node ns_tree_node; + struct ns_tree_node ns_owner_node; + struct ns_tree_root ns_owner_root; +}; + #endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 6a4ca8c3b9c4..f90509ee0900 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -26,20 +26,19 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } - -#define NS_COMMON_INIT(nsname, refs) \ -{ \ - .ns_type = ns_common_type(&nsname), \ - .ns_id = ns_init_id(&nsname), \ - .inum = ns_init_inum(&nsname), \ - .ops = to_ns_operations(&nsname), \ - .stashed = NULL, \ - .__ns_ref = REFCOUNT_INIT(refs), \ - .__ns_ref_active = ATOMIC_INIT(1), \ - .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ - .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ - .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \ - .ns_unified_list_node = LIST_HEAD_INIT(nsname.ns.ns_unified_list_node), \ +#define NS_COMMON_INIT(nsname, refs) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = ns_init_id(&nsname), \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref_active = ATOMIC_INIT(1), \ + .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ + .ns_tree_node.ns_list_entry = 
LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ + .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ + .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ } #define ns_common_init(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 98b848cf2f1c..175e4625bfa6 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -13,14 +13,14 @@ struct ns_common; -extern struct ns_tree cgroup_ns_tree; -extern struct ns_tree ipc_ns_tree; -extern struct ns_tree mnt_ns_tree; -extern struct ns_tree net_ns_tree; -extern struct ns_tree pid_ns_tree; -extern struct ns_tree time_ns_tree; -extern struct ns_tree user_ns_tree; -extern struct ns_tree uts_ns_tree; +extern struct ns_tree_root cgroup_ns_tree; +extern struct ns_tree_root ipc_ns_tree; +extern struct ns_tree_root mnt_ns_tree; +extern struct ns_tree_root net_ns_tree; +extern struct ns_tree_root pid_ns_tree; +extern struct ns_tree_root time_ns_tree; +extern struct ns_tree_root user_ns_tree; +extern struct ns_tree_root uts_ns_tree; void ns_tree_node_init(struct ns_tree_node *node); void ns_tree_root_init(struct ns_tree_root *root); @@ -46,14 +46,14 @@ void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, + struct ns_tree_root *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id) { __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); @@ -91,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, #define ns_tree_adjoined_rcu(__ns, __previous) \ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) -#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node)) #endif /* _LINUX_NSTREE_H */ diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c910b979e433..88f70baccb75 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -2,6 +2,7 @@ /* Copyright (c) 2025 Christian Brauner */ #include +#include #include #include #include @@ -61,14 +62,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - RB_CLEAR_NODE(&ns->ns_unified_tree_node); - RB_CLEAR_NODE(&ns->ns_owner_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); - INIT_LIST_HEAD(&ns->ns_unified_list_node); - ns->ns_owner_tree = RB_ROOT; - INIT_LIST_HEAD(&ns->ns_owner); - INIT_LIST_HEAD(&ns->ns_owner_entry); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); diff --git a/kernel/nstree.c b/kernel/nstree.c index fe71ff943f70..6c7ec9fbf25f 
100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -9,68 +9,51 @@ #include static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); -static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */ -static LIST_HEAD(ns_unified_list); /* protected by ns_tree_lock */ -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @type: type of namespaces in this tree - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - int type; +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), }; -struct ns_tree mnt_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .type = CLONE_NEWNS, +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), }; -struct ns_tree net_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .type = CLONE_NEWNET, +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), }; EXPORT_SYMBOL_GPL(net_ns_tree); -struct ns_tree uts_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .type = CLONE_NEWUTS, +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), }; -struct ns_tree user_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .type = CLONE_NEWUSER, +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), }; -struct ns_tree ipc_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .type = CLONE_NEWIPC, +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), }; -struct ns_tree pid_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .type = CLONE_NEWPID, +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), }; -struct ns_tree cgroup_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .type = CLONE_NEWCGROUP, +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), }; -struct ns_tree time_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .type = CLONE_NEWTIME, +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), }; /** @@ -162,21 +145,21 @@ static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_tree_node); + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); } static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_unified_tree_node); + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); } static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_owner_tree_node); + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); } static int ns_id_cmp(u64 id_a, 
u64 id_b) @@ -203,35 +186,22 @@ static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); } -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) { - struct rb_node *node, *prev; + struct rb_node *node; const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree_lock); - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->ns_tree_node); - if (!prev) - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); - else - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); /* Add to unified tree and list */ - rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified); - prev = rb_prev(&ns->ns_unified_tree_node); - if (!prev) - list_add_rcu(&ns->ns_unified_list_node, &ns_unified_list); - else - list_add_rcu(&ns->ns_unified_list_node, &node_to_ns_unified(prev)->ns_unified_list_node); + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + /* Add to owner's tree if applicable */ if (ops) { struct user_namespace *user_ns; @@ -241,15 +211,8 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) struct ns_common *owner = &user_ns->ns; VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); - /* Insert into owner's rbtree */ - rb_find_add_rcu(&ns->ns_owner_tree_node, &owner->ns_owner_tree, ns_cmp_owner); - - /* Insert into owner's list in sorted order */ - prev = rb_prev(&ns->ns_owner_tree_node); - if (!prev) - list_add_rcu(&ns->ns_owner_entry, &owner->ns_owner); - else - list_add_rcu(&ns->ns_owner_entry, &node_to_ns_owner(prev)->ns_owner_entry); + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); } else { /* Only the initial user namespace doesn't have an owner. 
*/ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); @@ -260,36 +223,29 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) VFS_WARN_ON_ONCE(node); } -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) { const struct proc_ns_operations *ops = ns->ops; struct user_namespace *user_ns; - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); write_seqlock(&ns_tree_lock); - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - RB_CLEAR_NODE(&ns->ns_tree_node); - - list_bidir_del_rcu(&ns->ns_list_node); - rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree); - RB_CLEAR_NODE(&ns->ns_unified_tree_node); + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); - list_bidir_del_rcu(&ns->ns_unified_list_node); + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); - /* Remove from owner's rbtree if this namespace has an owner */ + /* Remove from owner's tree if applicable */ if (ops) { user_ns = ops->owner(ns); if (user_ns) { struct ns_common *owner = &user_ns->ns; - rb_erase(&ns->ns_owner_tree_node, &owner->ns_owner_tree); - RB_CLEAR_NODE(&ns->ns_owner_tree_node); + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); } - - list_bidir_del_rcu(&ns->ns_owner_entry); } write_sequnlock(&ns_tree_lock); @@ -320,7 +276,7 @@ static int ns_find_unified(const void *key, const struct rb_node *node) return 0; } -static struct ns_tree *ns_tree_from_type(int ns_type) +static struct ns_tree_root *ns_tree_from_type(int ns_type) { switch (ns_type) { case CLONE_NEWCGROUP: @@ -351,7 +307,7 @@ static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) do { seq = read_seqbegin(&ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_unified_tree, ns_find_unified); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); if (node) break; } while (read_seqretry(&ns_tree_lock, seq)); @@ -361,7 +317,7 @@ static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) { - struct ns_tree *ns_tree; + struct ns_tree_root *ns_tree; struct rb_node *node; unsigned int seq; @@ -371,7 +327,7 @@ static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) do { seq = read_seqbegin(&ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); if (node) break; } while (read_seqretry(&ns_tree_lock, seq)); @@ -399,22 +355,20 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) * there is no next/previous namespace, -ENOENT is returned. 
*/ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, bool previous) + struct ns_tree_root *ns_tree, bool previous) { struct list_head *list; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); else - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); - if (list_is_head(list, &ns_tree->ns_list)) + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); - - return list_entry_rcu(list, struct ns_common, ns_list_node); + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); } /** @@ -508,7 +462,7 @@ static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); read_seqlock_excl(&ns_tree_lock); - node = owner->ns_owner_tree.rb_node; + node = owner->ns_owner_root.ns_rb.rb_node; while (node) { struct ns_common *ns; @@ -638,16 +592,15 @@ static ssize_t do_listns_userns(struct klistns *kls) } ret = 0; - head = &to_ns_common(kls->user_ns)->ns_owner; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); rcu_read_lock(); if (!first_ns) - first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); - - for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; - ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { + first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_node.ns_list_entry); + for (ns = first_ns; &ns->ns_owner_node.ns_list_entry != head && nr_ns_ids; + ns = list_entry_rcu(ns->ns_owner_node.ns_list_entry.next, typeof(*ns), ns_owner_node.ns_list_entry)) { struct ns_common *valid; valid = legitimize_ns(kls, ns); @@ -682,7 +635,7 @@ static ssize_t do_listns_userns(struct klistns *kls) static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) { struct ns_common *ret = NULL; - struct ns_tree *ns_tree = NULL; + struct ns_tree_root *ns_tree = NULL; struct rb_node *node; if (ns_type) { @@ -693,9 +646,9 @@ static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) read_seqlock_excl(&ns_tree_lock); if (ns_tree) - node = ns_tree->ns_tree.rb_node; + node = ns_tree->ns_rb.rb_node; else - node = ns_unified_tree.rb_node; + node = ns_unified_root.ns_rb.rb_node; while (node) { struct ns_common *ns; @@ -725,28 +678,28 @@ static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) } static inline struct ns_common *first_ns_common(const struct list_head *head, - struct ns_tree *ns_tree) + struct ns_tree_root *ns_tree) { if (ns_tree) - return list_entry_rcu(head->next, struct ns_common, ns_list_node); - return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node); + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); } static inline struct ns_common *next_ns_common(struct ns_common *ns, - struct ns_tree *ns_tree) + struct ns_tree_root *ns_tree) { if (ns_tree) - return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node); - return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node); + 
return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); } static inline bool ns_common_is_head(struct ns_common *ns, const struct list_head *head, - struct ns_tree *ns_tree) + struct ns_tree_root *ns_tree) { if (ns_tree) - return &ns->ns_list_node == head; - return &ns->ns_unified_list_node == head; + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; } static ssize_t do_listns(struct klistns *kls) @@ -754,7 +707,7 @@ static ssize_t do_listns(struct klistns *kls) u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; struct ns_common *ns, *first_ns = NULL, *prev = NULL; - struct ns_tree *ns_tree = NULL; + struct ns_tree_root *ns_tree = NULL; const struct list_head *head; u32 ns_type; ssize_t ret; @@ -779,9 +732,9 @@ static ssize_t do_listns(struct klistns *kls) ret = 0; if (ns_tree) - head = &ns_tree->ns_list; + head = &ns_tree->ns_list_head; else - head = &ns_unified_list; + head = &ns_unified_root.ns_list_head; rcu_read_lock(); -- cgit v1.2.3 From ed93c0697a8dcb70972a77bca2522a6a23ba6658 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:20 +0100 Subject: ns: make is_initial_namespace() argument const We don't modify the data structure at all so pass it as const. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-8-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f90509ee0900..7e4df96b7411 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); -static __always_inline bool is_initial_namespace(struct ns_common *ns) +static __always_inline bool is_initial_namespace(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, -- cgit v1.2.3 From 6bf253855aa8c970d2191f87ee23f9f184ddaa79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:21 +0100 Subject: ns: rename is_initial_namespace() Rename is_initial_namespace() to ns_init_inum() and make it symmetrical with the ns id variant. 
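A minimal illustration of the symmetry (the helper below is hypothetical and not part of this series; both real predicates appear in the diffs above):

	#include <linux/bug.h>
	#include <linux/ns_common.h>

	/* Hypothetical debug helper, illustration only. */
	static inline void example_check_boot_time_ns(const struct ns_common *ns)
	{
		/*
		 * is_ns_init_id() keys on the reserved namespace-id range,
		 * is_ns_init_inum() keys on the reserved inode-number range;
		 * for a boot-time namespace the two predicates must agree.
		 */
		WARN_ON_ONCE(is_ns_init_id(ns) != is_ns_init_inum(ns));
	}
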
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-9-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 +- kernel/nscommon.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7e4df96b7411..b9e8f21a6984 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); -static __always_inline bool is_initial_namespace(const struct ns_common *ns) +static __always_inline bool is_ns_init_inum(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 88f70baccb75..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -82,7 +82,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope * active use (installed in nsproxy) and decremented when all * active uses are gone. Initial namespaces are always active. */ - if (is_initial_namespace(ns)) + if (is_ns_init_inum(ns)) atomic_set(&ns->__ns_ref_active, 1); else atomic_set(&ns->__ns_ref_active, 0); -- cgit v1.2.3 From 657aeb436d70c66583cb2b5b6c65ca64bcf503a8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:24 +0100 Subject: ns: make all reference counts on initial namespace a nop They are always active so no need to needlessly cacheline ping-pong. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-12-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index b9e8f21a6984..5b8f2f0163d7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -62,6 +62,8 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { + if (is_ns_init_id(ns)) + return false; if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; @@ -71,6 +73,8 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { + if (is_ns_init_id(ns)) + return true; if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); @@ -82,12 +86,27 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns return refcount_read(&ns->__ns_ref); } +static __always_inline void __ns_ref_inc(struct ns_common *ns) +{ + if (is_ns_init_id(ns)) + return; + refcount_inc(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, + spinlock_t *ns_lock) +{ + if (is_ns_init_id(ns)) + return false; + return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); +} + #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns))) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) 
-#define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +#define ns_ref_put_and_lock(__ns, __ns_lock) \ + __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -- cgit v1.2.3 From 2b60d56acc5b4fcab29fc323e6b82597ec78596f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:25 +0100 Subject: ns: add asserts for initial namespace reference counts They always remain fixed at one. Notice when that assumptions is broken. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-13-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5b8f2f0163d7..dfb6b798ba82 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -60,10 +60,17 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com return atomic_read(&ns->__ns_ref_active); } +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return false; + } if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; @@ -73,31 +80,32 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return true; + } if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return false; } -static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) -{ - return refcount_read(&ns->__ns_ref); -} - static __always_inline void __ns_ref_inc(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return; + } refcount_inc(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, spinlock_t *ns_lock) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return false; + } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); } -- cgit v1.2.3 From 7118daabb65585163fd70eb782f1fbbdb64968a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:26 +0100 Subject: ns: add asserts for initial namespace active reference counts They always remain fixed at one. Notice when that assumptions is broken. 
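A compact restatement of the invariant these asserts (together with the ones from the previous patch) enforce, written as a hypothetical self-check helper (illustration only, not added by this series):

	#include <linux/bug.h>
	#include <linux/ns_common.h>

	/*
	 * Hypothetical helper, illustration only: boot-time namespaces keep
	 * both their reference count and their active reference count pinned
	 * at exactly 1 for their whole lifetime, so any other observed value
	 * means a caller bypassed the reference counting helpers.
	 */
	static inline void example_assert_init_ns_refs(const struct ns_common *ns)
	{
		if (!is_ns_init_id(ns))
			return;
		WARN_ON_ONCE(__ns_ref_read(ns) != 1);
		WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
	}
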
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-14-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index dfb6b798ba82..43f709ab846a 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -69,6 +69,7 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } if (refcount_dec_and_test(&ns->__ns_ref)) { @@ -82,6 +83,7 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return true; } if (refcount_inc_not_zero(&ns->__ns_ref)) @@ -94,6 +96,7 @@ static __always_inline void __ns_ref_inc(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return; } refcount_inc(&ns->__ns_ref); @@ -104,6 +107,7 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); -- cgit v1.2.3 From 282879afa01936954a570e15b4088a89b6e1b549 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:27 +0100 Subject: pid: rely on common reference count behavior Now that we changed the generic reference counting mechanism for all namespaces to never manipulate reference counts of initial namespaces we can drop the special handling for pid namespaces. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-15-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 3 +-- kernel/pid_namespace.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 445517a72ad0..0e7ae12c96d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { - if (ns != &init_pid_ns) - ns_ref_inc(ns); + ns_ref_inc(ns); return ns; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 650be58d8d18..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); -- cgit v1.2.3 From c2bbd2db521b018c59fb0ff8e1cdfa8ee907ba88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:28 +0100 Subject: ns: drop custom reference count initialization for initial namespaces Initial namespaces don't modify their reference count anymore. They remain fixed at one so drop the custom refcount initializations. 
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-16-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 4 ++-- init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index efaff8680eaf..25289b869be1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5986,7 +5986,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, } struct mnt_namespace init_mnt_ns = { - .ns = NS_COMMON_INIT(init_mnt_ns, 1), + .ns = NS_COMMON_INIT(init_mnt_ns), .user_ns = &init_user_ns, .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 43f709ab846a..136f6a322e53 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -26,14 +26,14 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } -#define NS_COMMON_INIT(nsname, refs) \ +#define NS_COMMON_INIT(nsname) \ { \ .ns_type = ns_common_type(&nsname), \ .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ - .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref = REFCOUNT_INIT(1), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 56ded64fdfe4..375726e05f69 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,7 @@ #include struct uts_namespace init_uts_ns = { - .ns = NS_COMMON_INIT(init_uts_ns, 2), + .ns = NS_COMMON_INIT(init_uts_ns), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 55a908ec0674..e28f0cecb2ec 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .ns = NS_COMMON_INIT(init_ipc_ns, 1), + .ns = NS_COMMON_INIT(init_ipc_ns), .user_ns = &init_user_ns, }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 20ab84b2cf4e..2bf3951ca88f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -250,7 +250,7 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns = NS_COMMON_INIT(init_cgroup_ns, 2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, .root_cset = &init_css_set, }; diff --git a/kernel/pid.c b/kernel/pid.c index a5a63dc0a491..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. 
*/ struct pid_namespace init_pid_ns = { - .ns = NS_COMMON_INIT(init_pid_ns, 2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 19911f88e2b8..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -478,7 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns = NS_COMMON_INIT(init_time_ns, 3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, .frozen_offsets = true, }; diff --git a/kernel/user.c b/kernel/user.c index 4b3132e786d9..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? */ struct user_namespace init_user_ns = { - .ns = NS_COMMON_INIT(init_user_ns, 3), + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { -- cgit v1.2.3 From dca3aa666fbd71118905d88bb1c353881002b647 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 9 Nov 2025 13:19:31 +0100 Subject: fs: move inode fields used during fast path lookup closer together This should avoid *some* cache misses. Successful path lookup is guaranteed to load at least ->i_mode, ->i_opflags and ->i_acl. At the same time the common case will avoid looking at more fields. struct inode is not guaranteed to have any particular alignment, notably ext4 has it only aligned to 8 bytes meaning nearby fields might happen to be on the same or only adjacent cache lines depending on luck (or no luck). According to pahole: umode_t i_mode; /* 0 2 */ short unsigned int i_opflags; /* 2 2 */ kuid_t i_uid; /* 4 4 */ kgid_t i_gid; /* 8 4 */ unsigned int i_flags; /* 12 4 */ struct posix_acl * i_acl; /* 16 8 */ struct posix_acl * i_default_acl; /* 24 8 */ ->i_acl is unnecessarily separated by 8 bytes from the other fields. With struct inode being offset 48 bytes into the cacheline this means an avoidable miss. Note it will still be there for the 56 byte case. New layout: umode_t i_mode; /* 0 2 */ short unsigned int i_opflags; /* 2 2 */ unsigned int i_flags; /* 4 4 */ struct posix_acl * i_acl; /* 8 8 */ struct posix_acl * i_default_acl; /* 16 8 */ kuid_t i_uid; /* 24 4 */ kgid_t i_gid; /* 28 4 */ I verified with pahole there are no size or hole changes. This is stopgap until someone(tm) sanitizes the layout in the first place, allocation methods aside. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251109121931.1285366-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c0c0095b2b60..64dc2e2c281f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -781,14 +781,13 @@ enum inode_state_flags_t { struct inode { umode_t i_mode; unsigned short i_opflags; - kuid_t i_uid; - kgid_t i_gid; unsigned int i_flags; - #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + kuid_t i_uid; + kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; -- cgit v1.2.3 From f5a538c07df26f5c601e41f7b9c7ade3e1e75803 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 13:54:24 +0100 Subject: sched/deadline: Fix dl_server stop condition Gabriel reported that the dl_server doesn't stop as expected. 
The problem was found to be the fact that idle time and fair runtime are treated equally. Both will count towards dl_server runtime and push the activation forwards when it is in the zero-laxity wait state. Notably: dl_server_update_idle() update_curr_dl_se() if (dl_defer && dl_throttled && dl_runtime_exceeded()) hrtimer_try_to_cancel(); // stop timer replenish_dl_new_period() deadline = now + dl_deadline; // fwd period runtime = dl_runtime; start_dl_timer(); // restart timer And while we do want idle time accounted towards the *current* activation of the dl_server -- after all, a fair task could've ran if we had any -- we don't necessarily want idle time to cause or push forward an activation. Introduce dl_defer_idle to make this distinction. It will be set once idle time pushed the activation forward, once set idle time will only be allowed to consume any runtime but not push the activation. This will then cause dl_server_timer() to fire, which will stop the dl_server. Any non-idle time accounting during this phase will clear dl_defer_idle, so only a full period of idle will cause the dl_server to stop. Reported-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251101000057.GA2184199@noisy.programming.kicks-ass.net --- include/linux/sched.h | 15 +++++++++------ kernel/sched/deadline.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 07576479c0ed..bb436ee1942d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -685,20 +685,22 @@ struct sched_dl_entity { * * @dl_server tells if this is a server entity. * - * @dl_defer tells if this is a deferred or regular server. For - * now only defer server exists. - * - * @dl_defer_armed tells if the deferrable server is waiting - * for the replenishment timer to activate it. - * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -709,6 +711,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ece25caf379c..8307f24b8900 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1173,6 +1173,11 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ */ rq->donor->sched_class->update_curr(rq); + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); + return HRTIMER_NORESTART; + } + if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. 
@@ -1420,10 +1425,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1444,6 +1450,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1453,6 +1462,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 * starting a new period, pushing the activation. */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler @@ -1465,6 +1497,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1590,6 +1625,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } -- cgit v1.2.3 From 9da611df15aa8d519f9947b88a5c733267cba888 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 7 Nov 2025 23:04:04 -0800 Subject: net/mlx5: E-Switch, support eswitch inactive mode Add support for eswitch switchdev inactive mode Inactive mode: Drop all traffic going to FDB, Remove mpfs l2 rules and disconnect adjacent vports. Active mode: Traffic flows through FDB, mpfs table populated, and adjacent vports are connected. 
Signed-off-by: Saeed Mahameed Signed-off-by: Adithya Jayachandran Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20251108070404.1551708-4-saeed@kernel.org Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/esw/adj_vport.c | 15 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 6 + .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 207 ++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 5 + drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c | 2 +- include/linux/mlx5/fs.h | 1 + 6 files changed, 214 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c index 0091ba697bae..250af09b5af2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/adj_vport.c @@ -4,13 +4,8 @@ #include "fs_core.h" #include "eswitch.h" -enum { - MLX5_ADJ_VPORT_DISCONNECT = 0x0, - MLX5_ADJ_VPORT_CONNECT = 0x1, -}; - -static int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, - u16 vport, bool connect) +int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport, + bool connect) { u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; @@ -24,7 +19,7 @@ static int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, MLX5_SET(modify_vport_state_in, in, egress_connect_valid, 1); MLX5_SET(modify_vport_state_in, in, ingress_connect, connect); MLX5_SET(modify_vport_state_in, in, egress_connect, connect); - + MLX5_SET(modify_vport_state_in, in, admin_state, connect); return mlx5_cmd_exec_in(dev, modify_vport_state, in); } @@ -96,7 +91,6 @@ static int mlx5_esw_adj_vport_create(struct mlx5_eswitch *esw, u16 vhca_id, if (err) goto acl_ns_remove; - mlx5_esw_adj_vport_modify(esw->dev, vport_num, MLX5_ADJ_VPORT_CONNECT); return 0; acl_ns_remove: @@ -117,8 +111,7 @@ static void mlx5_esw_adj_vport_destroy(struct mlx5_eswitch *esw, esw_debug(esw->dev, "Destroying adjacent vport %d for vhca_id 0x%x\n", vport_num, vport->vhca_id); - mlx5_esw_adj_vport_modify(esw->dev, vport_num, - MLX5_ADJ_VPORT_DISCONNECT); + mlx5_esw_offloads_rep_remove(esw, vport); mlx5_fs_vport_egress_acl_ns_remove(esw->dev->priv.steering, vport->index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 16eb99aba2a7..beaec450a734 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -264,6 +264,9 @@ struct mlx5_eswitch_fdb { struct offloads_fdb { struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *drop_root; + struct mlx5_flow_handle *drop_root_rule; + struct mlx5_fc *drop_root_fc; struct mlx5_flow_table *tc_miss_table; struct mlx5_flow_table *slow_fdb; struct mlx5_flow_group *send_to_vport_grp; @@ -392,6 +395,7 @@ struct mlx5_eswitch { struct mlx5_esw_offload offloads; u32 last_vport_idx; int mode; + bool offloads_inactive; u16 manager_vport; u16 first_host_vport; u8 num_peers; @@ -634,6 +638,8 @@ const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev); void mlx5_esw_adjacent_vhcas_setup(struct mlx5_eswitch *esw); void mlx5_esw_adjacent_vhcas_cleanup(struct mlx5_eswitch *esw); +int mlx5_esw_adj_vport_modify(struct mlx5_core_dev *dev, u16 vport, + bool connect); #define MLX5_DEBUG_ESWITCH_MASK BIT(3) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 4092ea29c630..0b1a180ef238 100644 
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1577,6 +1577,7 @@ esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb) attr.max_grp_num = esw->params.large_group_num; attr.default_ft = miss_fdb; attr.mapping = esw->offloads.reg_c0_obj_pool; + attr.fs_base_prio = FDB_BYPASS_PATH; chains = mlx5_chains_create(dev, &attr); if (IS_ERR(chains)) { @@ -2355,6 +2356,131 @@ static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode) mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp); } +static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw) +{ + if (!esw->fdb_table.offloads.drop_root) + return; + + esw_debug(esw->dev, "Destroying FDB drop root table %#x fc %#x\n", + esw->fdb_table.offloads.drop_root->id, + esw->fdb_table.offloads.drop_root_fc->id); + mlx5_del_flow_rules(esw->fdb_table.offloads.drop_root_rule); + /* Don't free flow counter here, can be reused on a later activation */ + mlx5_destroy_flow_table(esw->fdb_table.offloads.drop_root); + esw->fdb_table.offloads.drop_root_rule = NULL; + esw->fdb_table.offloads.drop_root = NULL; +} + +static int mlx5_esw_fdb_drop_create(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_destination drop_fc_dst = {}; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_table *table; + int err = 0, dst_num = 0; + + if (esw->fdb_table.offloads.drop_root) + return 0; + + root_ns = esw->fdb_table.offloads.ns; + + ft_attr.prio = FDB_DROP_ROOT; + ft_attr.max_fte = 1; + ft_attr.autogroup.max_num_groups = 1; + table = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); + if (IS_ERR(table)) { + esw_warn(dev, "Failed to create fdb drop root table, err %pe\n", + table); + return PTR_ERR(table); + } + + /* Drop FC reusable, create once on first deactivation of FDB */ + if (!esw->fdb_table.offloads.drop_root_fc) { + struct mlx5_fc *counter = mlx5_fc_create(dev, 0); + + err = PTR_ERR_OR_ZERO(counter); + if (err) + esw_warn(esw->dev, "create fdb drop fc err %d\n", err); + else + esw->fdb_table.offloads.drop_root_fc = counter; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + if (esw->fdb_table.offloads.drop_root_fc) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_fc_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_fc_dst.counter = esw->fdb_table.offloads.drop_root_fc; + dst = &drop_fc_dst; + dst_num++; + } + + flow_rule = mlx5_add_flow_rules(table, NULL, &flow_act, dst, dst_num); + err = PTR_ERR_OR_ZERO(flow_rule); + if (err) { + esw_warn(esw->dev, + "fs offloads: Failed to add vport rx drop rule err %d\n", + err); + goto err_flow_rule; + } + + esw->fdb_table.offloads.drop_root = table; + esw->fdb_table.offloads.drop_root_rule = flow_rule; + esw_debug(esw->dev, "Created FDB drop root table %#x fc %#x\n", + table->id, dst ? 
dst->counter->id : 0); + return 0; + +err_flow_rule: + /* no need to free drop fc, esw_offloads_steering_cleanup will do it */ + mlx5_destroy_flow_table(table); + return err; +} + +static void mlx5_esw_fdb_active(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_fdb_drop_destroy(esw); + mlx5_mpfs_enable(esw->dev); + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + esw_debug(esw->dev, "Connecting vport %d to eswitch\n", + vport->vport); + mlx5_esw_adj_vport_modify(esw->dev, vport->vport, true); + } + + esw->offloads_inactive = false; + esw_warn(esw->dev, "MPFS/FDB active\n"); +} + +static void mlx5_esw_fdb_inactive(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_mpfs_disable(esw->dev); + mlx5_esw_fdb_drop_create(esw); + + mlx5_esw_for_each_vf_vport(esw, i, vport, U16_MAX) { + if (!vport->adjacent) + continue; + esw_debug(esw->dev, "Disconnecting vport %u from eswitch\n", + vport->vport); + + mlx5_esw_adj_vport_modify(esw->dev, vport->vport, false); + } + + esw->offloads_inactive = true; + esw_warn(esw->dev, "MPFS/FDB inactive\n"); +} + static int esw_offloads_start(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) { @@ -3438,6 +3564,10 @@ create_indir_err: static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) { + mlx5_esw_fdb_drop_destroy(esw); + if (esw->fdb_table.offloads.drop_root_fc) + mlx5_fc_destroy(esw->dev, esw->fdb_table.offloads.drop_root_fc); + esw->fdb_table.offloads.drop_root_fc = NULL; esw_destroy_vport_rx_drop_rule(esw); esw_destroy_vport_rx_drop_group(esw); esw_destroy_vport_rx_group(esw); @@ -3600,6 +3730,11 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_steering_init; + if (esw->offloads_inactive) + mlx5_esw_fdb_inactive(esw); + else + mlx5_esw_fdb_active(esw); + /* Representor will control the vport link state */ mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN; @@ -3666,6 +3801,9 @@ void esw_offloads_disable(struct mlx5_eswitch *esw) esw_offloads_metadata_uninit(esw); mlx5_rdma_disable_roce(esw->dev); mlx5_esw_adjacent_vhcas_cleanup(esw); + /* must be done after vhcas cleanup to avoid adjacent vports connect */ + if (esw->offloads_inactive) + mlx5_esw_fdb_active(esw); /* legacy mode always active */ mutex_destroy(&esw->offloads.termtbl_mutex); } @@ -3676,6 +3814,7 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) *mlx5_mode = MLX5_ESWITCH_LEGACY; break; case DEVLINK_ESWITCH_MODE_SWITCHDEV: + case DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE: *mlx5_mode = MLX5_ESWITCH_OFFLOADS; break; default: @@ -3685,14 +3824,17 @@ static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) return 0; } -static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode) +static int esw_mode_to_devlink(struct mlx5_eswitch *esw, u16 *mode) { - switch (mlx5_mode) { + switch (esw->mode) { case MLX5_ESWITCH_LEGACY: *mode = DEVLINK_ESWITCH_MODE_LEGACY; break; case MLX5_ESWITCH_OFFLOADS: - *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; + if (esw->offloads_inactive) + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE; + else + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; break; default: return -EINVAL; @@ -3798,6 +3940,45 @@ static bool mlx5_devlink_netdev_netns_immutable_set(struct devlink *devlink, return ret; } +/* Returns true when only changing between active and inactive switchdev mode */ +static bool mlx5_devlink_switchdev_active_mode_change(struct mlx5_eswitch *esw, + u16 
devlink_mode) +{ + /* current mode is not switchdev */ + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return false; + + /* new mode is not switchdev */ + if (devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV && + devlink_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE) + return false; + + /* already inactive: no change in current state */ + if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE && + esw->offloads_inactive) + return false; + + /* already active: no change in current state */ + if (devlink_mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && + !esw->offloads_inactive) + return false; + + down_write(&esw->mode_lock); + esw->offloads_inactive = !esw->offloads_inactive; + esw->eswitch_operation_in_progress = true; + up_write(&esw->mode_lock); + + if (esw->offloads_inactive) + mlx5_esw_fdb_inactive(esw); + else + mlx5_esw_fdb_active(esw); + + down_write(&esw->mode_lock); + esw->eswitch_operation_in_progress = false; + up_write(&esw->mode_lock); + return true; +} + int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { @@ -3812,12 +3993,16 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (esw_mode_from_devlink(mode, &mlx5_mode)) return -EINVAL; - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) { + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && mlx5_get_sd(esw->dev)) { NL_SET_ERR_MSG_MOD(extack, "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured."); return -EPERM; } + /* Avoid try_lock, active/inactive mode change is not restricted */ + if (mlx5_devlink_switchdev_active_mode_change(esw, mode)) + return 0; + mlx5_lag_disable_change(esw->dev); err = mlx5_esw_try_lock(esw); if (err < 0) { @@ -3840,7 +4025,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && !mlx5_devlink_netdev_netns_immutable_set(devlink, true)) { NL_SET_ERR_MSG_MOD(extack, "Can't change E-Switch mode to switchdev when netdev net namespace has diverged from the devlink's."); @@ -3848,25 +4033,27 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, goto skip; } - if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + if (mlx5_mode == MLX5_ESWITCH_LEGACY) esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { NL_SET_ERR_MSG_MOD(extack, "Can't change mode while devlink traps are active"); err = -EOPNOTSUPP; goto skip; } + esw->offloads_inactive = + (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE); err = esw_offloads_start(esw, extack); - } else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) { + } else if (mlx5_mode == MLX5_ESWITCH_LEGACY) { err = esw_offloads_stop(esw, extack); } else { err = -EINVAL; } skip: - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && err) + if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && err) mlx5_devlink_netdev_netns_immutable_set(devlink, false); down_write(&esw->mode_lock); esw->eswitch_operation_in_progress = false; @@ -3885,7 +4072,7 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) if (IS_ERR(esw)) return PTR_ERR(esw); - return esw_mode_to_devlink(esw->mode, mode); + return esw_mode_to_devlink(esw, mode); } static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode, diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2db3ffb0a2b2..2ca3bddbdf05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3520,6 +3520,11 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering) if (!steering->fdb_root_ns) return -ENOMEM; + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_DROP_ROOT, 1); + err = PTR_ERR_OR_ZERO(maj_prio); + if (err) + goto out_err; + err = create_fdb_bypass(steering); if (err) goto out_err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c index 99fb7a53add0..4a88a42ae4f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -167,7 +167,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) if (err) goto free_l2table_index; mlx5_core_dbg(dev, "MPFS entry %pM, set @index (%d)\n", - l2addr->node.addr, l2addr->index); + l2addr->node.addr, index); } l2addr->index = index; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6ac76a0c3827..7bf2449c53b2 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -116,6 +116,7 @@ enum mlx5_flow_namespace_type { }; enum { + FDB_DROP_ROOT, FDB_BYPASS_PATH, FDB_CRYPTO_INGRESS, FDB_TC_OFFLOAD, -- cgit v1.2.3 From 4f739ed19d222de33b19ca639a34523fbbec20d0 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 14 Oct 2025 07:51:56 +0200 Subject: rv: Pass va_list to reactors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only thing the reactors can do with the passed in varargs is to convert it into a va_list. Do that in a central helper instead. It simplifies the reactors, removes some hairy macro-generated code and introduces a convenient hook point to modify reactor behavior. 
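As a rough sketch (illustrative names, not part of this patch), a reactor written
against the new interface simply forwards the ready-made va_list to a v*-style
printer:

	__printf(1, 0) static void my_reaction(const char *msg, va_list args)
	{
		vprintk_deferred(msg, args);
	}

	static struct rv_reactor my_reactor = {
		.name		= "my_reactor",
		.description	= "example reactor forwarding to printk",
		.react		= my_reaction,
	};

Monitors no longer invoke the reactor callback directly; they call
rv_react(&rv_<monitor>, fmt, ...), and the central helper performs the
va_start()/va_end() bookkeeping plus the reacting_on and monitor->react checks
in one place.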
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-1-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 11 +++++++++-- include/rv/da_monitor.h | 35 ++++++++++------------------------- include/rv/ltl_monitor.h | 18 +++++------------- kernel/trace/rv/reactor_panic.c | 6 +----- kernel/trace/rv/reactor_printk.c | 6 +----- kernel/trace/rv/rv_reactors.c | 16 +++++++++++++++- 6 files changed, 41 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 9520aab34bcb..b567b0191e67 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -88,7 +88,7 @@ union rv_task_monitor { struct rv_reactor { const char *name; const char *description; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); struct list_head list; }; #endif @@ -102,7 +102,7 @@ struct rv_monitor { void (*reset)(void); #ifdef CONFIG_RV_REACTORS struct rv_reactor *reactor; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); #endif struct list_head list; struct rv_monitor *parent; @@ -119,11 +119,18 @@ void rv_put_task_monitor_slot(int slot); bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); +__printf(2, 3) +void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else static inline bool rv_reacting_on(void) { return false; } + +__printf(2, 3) +static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) +{ +} #endif /* CONFIG_RV_REACTORS */ #endif /* CONFIG_RV */ diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 17fa4f6e5ea6..0cef64366538 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -16,34 +16,19 @@ #include #include -#ifdef CONFIG_RV_REACTORS - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - if (!rv_reacting_on() || !rv_##name.react) \ - return; \ - rv_##name.react("rv: monitor %s does not allow event %s on state %s\n", \ - #name, \ - model_get_event_name_##name(event), \ - model_get_state_name_##name(curr_state)); \ -} - -#else /* CONFIG_RV_REACTOR */ - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - return; \ -} -#endif - /* * Generic helpers for all types of deterministic automata monitors. 
*/ #define DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ \ -DECLARE_RV_REACTING_HELPERS(name, type) \ +static void react_##name(type curr_state, type event) \ +{ \ + rv_react(&rv_##name, \ + "rv: monitor %s does not allow event %s on state %s\n", \ + #name, \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(curr_state)); \ +} \ \ /* \ * da_monitor_reset_##name - reset a monitor and setting it to init state \ @@ -126,7 +111,7 @@ da_event_##name(struct da_monitor *da_mon, enum events_##name event) \ for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if (next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ return false; \ @@ -165,7 +150,7 @@ static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if (next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(tsk->pid, \ model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h index 5368cf5fd623..00c42b36f961 100644 --- a/include/rv/ltl_monitor.h +++ b/include/rv/ltl_monitor.h @@ -16,21 +16,12 @@ #error "Please include $(MODEL_NAME).h generated by rvgen" #endif -#ifdef CONFIG_RV_REACTORS #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME) -static struct rv_monitor RV_MONITOR_NAME; -static void rv_cond_react(struct task_struct *task) -{ - if (!rv_reacting_on() || !RV_MONITOR_NAME.react) - return; - RV_MONITOR_NAME.react("rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", - task->comm, task->pid); -} +#ifdef CONFIG_RV_REACTORS +static struct rv_monitor RV_MONITOR_NAME; #else -static void rv_cond_react(struct task_struct *task) -{ -} +extern struct rv_monitor RV_MONITOR_NAME; #endif static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT; @@ -98,7 +89,8 @@ static void ltl_monitor_destroy(void) static void ltl_illegal_state(struct task_struct *task, struct ltl_monitor *mon) { CONCATENATE(trace_error_, MONITOR_NAME)(task); - rv_cond_react(task); + rv_react(&RV_MONITOR_NAME, "rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", + task->comm, task->pid); } static void ltl_attempt_start(struct task_struct *task, struct ltl_monitor *mon) diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 74c6bcc2c749..76537b8a4343 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,13 +13,9 @@ #include #include -__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) +__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vpanic(msg, args); - va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 2dae2916c05f..48c934e315b3 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,13 +12,9 @@ #include #include -__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) 
+__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vprintk_deferred(msg, args); - va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index d32859fec238..cb1a5968055a 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -438,7 +438,7 @@ int reactor_populate_monitor(struct rv_monitor *mon) /* * Nop reactor register */ -__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) +__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args) { } @@ -477,3 +477,17 @@ rm_available: out_err: return -ENOMEM; } + +void rv_react(struct rv_monitor *monitor, const char *msg, ...) +{ + va_list args; + + if (!rv_reacting_on() || !monitor->react) + return; + + va_start(args, msg); + + monitor->react(msg, args); + + va_end(args); +} -- cgit v1.2.3 From 68f63cea46d3a410a41d9ab74d338038a22bc2ad Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 14 Oct 2025 07:51:57 +0200 Subject: rv: Make rv_reacting_on() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no external users left. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-2-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ------ kernel/trace/rv/rv_reactors.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index b567b0191e67..92fd467547e7 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -116,17 +116,11 @@ int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); #ifdef CONFIG_RV_REACTORS -bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); __printf(2, 3) void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else -static inline bool rv_reacting_on(void) -{ - return false; -} - __printf(2, 3) static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) { diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index cb1a5968055a..8c02426bc3bd 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -347,7 +347,7 @@ static bool __read_mostly reacting_on; * * Returns 1 if on, 0 otherwise. */ -bool rv_reacting_on(void) +static bool rv_reacting_on(void) { /* Ensures that concurrent monitors read consistent reacting_on */ smp_rmb(); -- cgit v1.2.3 From e5eba42f01340f73888dfe560be2806057c25913 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Sun, 9 Nov 2025 11:49:03 +0200 Subject: mlx5: Fix default values in create CQ Currently, CQs without a completion function are assigned the mlx5_add_cq_to_tasklet function by default. This is problematic since only user CQs created through the mlx5_ib driver are intended to use this function. Additionally, all CQs that will use doorbells instead of polling for completions must call mlx5_cq_arm. However, the default CQ creation flow leaves a valid value in the CQ's arm_db field, allowing FW to send interrupts to polling-only CQs in certain corner cases. These two factors would allow a polling-only kernel CQ to be triggered by an EQ interrupt and call a completion function intended only for user CQs, causing a null pointer exception. 
Some areas in the driver have prevented this issue with one-off fixes but did not address the root cause. This patch fixes the described issue by adding defaults to the create CQ flow. It adds a default dummy completion function to protect against null pointer exceptions, and it sets an invalid command sequence number by default in kernel CQs to prevent the FW from sending an interrupt to the CQ until it is armed. User CQs are responsible for their own initialization values. Callers of mlx5_core_create_cq are responsible for changing the completion function and arming the CQ per their needs. Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO") Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Acked-by: Leon Romanovsky Link: https://patch.msgid.link/1762681743-1084694-1-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/infiniband/hw/mlx5/cq.c | 11 +++++---- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 23 +++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 15 ++++++------ .../mellanox/mlx5/core/steering/hws/send.c | 7 ------ .../mellanox/mlx5/core/steering/sws/dr_send.c | 28 ++++++---------------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++--- include/linux/mlx5/cq.h | 1 + 8 files changed, 44 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a23b364e24ff..651d76bca114 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1020,15 +1020,18 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); + if (udata) { + cq->mcq.comp = mlx5_add_cq_to_tasklet; + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + } else { + cq->mcq.comp = mlx5_ib_cq_comp; + } + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); if (err) goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - if (udata) - cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; - else - cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index e9f319a9bdd6..60f7ab1d72e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -66,8 +66,8 @@ void mlx5_cq_tasklet_cb(struct tasklet_struct *t) tasklet_schedule(&ctx->task); } -static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, - struct mlx5_eqe *eqe) +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, + struct mlx5_eqe *eqe) { unsigned long flags; struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; @@ -95,7 +95,15 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, if (schedule_tasklet) tasklet_schedule(&tasklet_ctx->task); } +EXPORT_SYMBOL(mlx5_add_cq_to_tasklet); +static void mlx5_core_cq_dummy_cb(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) +{ + mlx5_core_err(cq->eq->core.dev, + "CQ default completion callback, CQ #%u\n", cq->cqn); +} + +#define MLX5_CQ_INIT_CMD_SN cpu_to_be32(2 << 28) /* Callers must verify outbox status in case of err */ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) @@ -121,10 +129,19 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; cq->eq 
= eq; cq->uid = MLX5_GET(create_cq_in, in, uid); + + /* Kernel CQs must set the arm_db address prior to calling + * this function, allowing for the proper value to be + * initialized. User CQs are responsible for their own + * initialization since they do not use the arm_db field. + */ + if (cq->arm_db) + *cq->arm_db = MLX5_CQ_INIT_CMD_SN; + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) - cq->comp = mlx5_add_cq_to_tasklet; + cq->comp = mlx5_core_cq_dummy_cb; /* assuming CQ will be deleted before the EQ */ cq->tasklet_ctx.priv = &eq->tasklet_ctx; INIT_LIST_HEAD(&cq->tasklet_ctx.list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6023bbbf3f39..5e17eae81f4b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2219,7 +2219,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; - *mcq->arm_db = 0; mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index cb1319974f83..ccef64fb40b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -421,6 +421,13 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) __be64 *pas; u32 i; + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + cq_size = roundup_pow_of_two(cq_size); MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); @@ -468,15 +475,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) if (err) goto err_cqwq; - conn->cq.mcq.cqe_sz = 64; - conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; - conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; - *conn->cq.mcq.set_ci_db = 0; - *conn->cq.mcq.arm_db = 0; - conn->cq.mcq.vector = 0; - conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); - mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 24ef7d66fa8a..7510c46e58a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -873,12 +873,6 @@ err_free_sqc: return err; } -static void hws_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5hws_send_engine *queue, @@ -901,7 +895,6 @@ static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; - mcq->comp = hws_cq_complete; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { cqe = mlx5_cqwq_get_wqe(&cq->wq, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 077a77fde670..d034372fa047 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1049,12 +1049,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return 0; } -static void dr_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -1089,6 +1083,13 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; } + cq->mcq.cqe_sz = 64; + cq->mcq.set_ci_db = cq->wq_ctrl.db.db; + cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; + *cq->mcq.set_ci_db = 0; + cq->mcq.vector = 0; + cq->mdev = mdev; + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = kvzalloc(inlen, GFP_KERNEL); @@ -1112,27 +1113,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); - cq->mcq.comp = dr_cq_complete; - err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); if (err) goto err_cqwq; - cq->mcq.cqe_sz = 64; - cq->mcq.set_ci_db = cq->wq_ctrl.db.db; - cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; - *cq->mcq.set_ci_db = 0; - - /* set no-zero value, in order to avoid the HW to run db-recovery on - * CQ that used in polling mode. - */ - *cq->mcq.arm_db = cpu_to_be32(2 << 28); - - cq->mcq.vector = 0; - cq->mdev = mdev; - return cq; err_cqwq: diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 82034efb74fc..a7936bd1aabe 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -573,6 +573,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) vcq->mcq.set_ci_db = vcq->db.db; vcq->mcq.arm_db = vcq->db.db + 1; vcq->mcq.cqe_sz = 64; + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); if (err) @@ -612,10 +614,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) if (err) goto err_vec; - vcq->mcq.comp = mlx5_vdpa_cq_comp; - vcq->cqe = num_ent; - vcq->mcq.set_ci_db = vcq->db.db; - vcq->mcq.arm_db = vcq->db.db + 1; mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); kfree(in); return 0; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 7ef2c7c7d803..9d47cdc727ad 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -183,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) complete(&cq->free); } +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe); int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, -- cgit v1.2.3 From c07a491c1b735e0c27454ea5c27a446d43401b1e Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 31 Oct 2025 19:24:48 -0700 Subject: net: export netdev_get_by_index_lock() Need to call netdev_get_by_index_lock() from io_uring/zcrx.c, but it is currently private to net. Export the function in linux/netdevice.h. 
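A usage sketch (the io_uring/zcrx call site is not part of this patch, and the
pairing below assumes the same lock discipline as existing in-tree callers:
the helper returns the device with its instance lock held, released with
netdev_unlock()):

	struct net_device *netdev;

	netdev = netdev_get_by_index_lock(net, ifindex);
	if (!netdev)
		return -ENODEV;
	/* ... access instance-locked netdev state ... */
	netdev_unlock(netdev);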
Signed-off-by: David Wei Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/netdevice.h | 1 + net/core/dev.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a687444b27..77c46a2823ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3401,6 +3401,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, diff --git a/net/core/dev.h b/net/core/dev.h index 900880e8b5b4..df8a90fe89f8 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -29,7 +29,6 @@ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, -- cgit v1.2.3 From cb46a58d77e5b433e9f4538faaa2a73970157e8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Oct 2025 03:44:04 -0700 Subject: efi/memattr: Convert efi_memattr_init() return type to void The efi_memattr_init() function's return values (0 and -ENOMEM) are never checked by callers. Convert the function to return void since the return status is unused. Signed-off-by: Breno Leitao Acked-by: Ard Biesheuvel Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/memattr.c | 7 +++---- include/linux/efi.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index c38b1a335590..e727cc5909cb 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR; * Reserve the memory associated with the Memory Attributes configuration * table, if it exists. 
*/ -int __init efi_memattr_init(void) +void __init efi_memattr_init(void) { efi_memory_attributes_table_t *tbl; unsigned long size; if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR) - return 0; + return; tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl)); if (!tbl) { pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", efi_mem_attr_table); - return -ENOMEM; + return; } if (tbl->version > 2) { @@ -61,7 +61,6 @@ int __init efi_memattr_init(void) unmap: early_memunmap(tbl, sizeof(*tbl)); - return 0; } /* diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..0b9eb3d2ff97 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -772,7 +772,7 @@ extern unsigned long efi_mem_attr_table; */ typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *, bool); -extern int efi_memattr_init(void); +extern void efi_memattr_init(void); extern int efi_memattr_apply_permissions(struct mm_struct *mm, efi_memattr_perm_setter fn); -- cgit v1.2.3 From a2860501203cf7a2116adf3bb4e4c456c5750872 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 15 Oct 2025 22:56:37 +0200 Subject: efi/runtime-wrappers: Keep track of the efi_runtime_lock owner The EFI runtime wrappers use a file local semaphore to serialize access to the EFI runtime services. This means that any calls to the arch wrappers around the runtime services will also be serialized, removing the need for redundant locking. For robustness, add a facility that allows those arch wrappers to assert that the semaphore was taken by the current task. Signed-off-by: Ard Biesheuvel Acked-by: Catalin Marinas Signed-off-by: Catalin Marinas --- drivers/firmware/efi/runtime-wrappers.c | 17 ++++++++++++++++- include/linux/efi.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 708b777857d3..da8d29621644 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -202,6 +202,8 @@ void efi_call_virt_check_flags(unsigned long flags, const void *caller) */ static DEFINE_SEMAPHORE(efi_runtime_lock, 1); +static struct task_struct *efi_runtime_lock_owner; + /* * Expose the EFI runtime lock to the UV platform */ @@ -219,6 +221,8 @@ static void __nocfi efi_call_rts(struct work_struct *work) efi_status_t status = EFI_NOT_FOUND; unsigned long flags; + efi_runtime_lock_owner = current; + arch_efi_call_virt_setup(); flags = efi_call_virt_save_flags(); @@ -310,6 +314,7 @@ static void __nocfi efi_call_rts(struct work_struct *work) efi_rts_work.status = status; complete(&efi_rts_work.efi_rts_comp); + efi_runtime_lock_owner = NULL; } static efi_status_t __efi_queue_work(enum efi_rts_ids id, @@ -444,8 +449,10 @@ virt_efi_set_variable_nb(efi_char16_t *name, efi_guid_t *vendor, u32 attr, if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; + efi_runtime_lock_owner = current; status = efi_call_virt_pointer(efi.runtime, set_variable, name, vendor, attr, data_size, data); + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); return status; } @@ -481,9 +488,11 @@ virt_efi_query_variable_info_nb(u32 attr, u64 *storage_space, if (down_trylock(&efi_runtime_lock)) return EFI_NOT_READY; + efi_runtime_lock_owner = current; status = efi_call_virt_pointer(efi.runtime, query_variable_info, attr, storage_space, remaining_space, max_variable_size); + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); return status; } @@ -509,12 +518,13 @@ 
virt_efi_reset_system(int reset_type, efi_status_t status, return; } + efi_runtime_lock_owner = current; arch_efi_call_virt_setup(); efi_rts_work.efi_rts_id = EFI_RESET_SYSTEM; arch_efi_call_virt(efi.runtime, reset_system, reset_type, status, data_size, data); arch_efi_call_virt_teardown(); - + efi_runtime_lock_owner = NULL; up(&efi_runtime_lock); } @@ -587,3 +597,8 @@ efi_call_acpi_prm_handler(efi_status_t (__efiapi *handler_addr)(u64, void *), } #endif + +void efi_runtime_assert_lock_held(void) +{ + WARN_ON(efi_runtime_lock_owner != current); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..b23ff8b83219 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; } extern void efi_call_virt_check_flags(unsigned long flags, const void *caller); extern unsigned long efi_call_virt_save_flags(void); +void efi_runtime_assert_lock_held(void); + enum efi_secureboot_mode { efi_secureboot_mode_unset, efi_secureboot_mode_unknown, -- cgit v1.2.3 From 693d1eaca940f277af24c74873ef2313816ff444 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 11 Nov 2025 18:58:35 +0000 Subject: coresight: Change device mode to atomic type The device mode is defined as local type. This type cannot promise SMP-safe access. Change to atomic type and impose relax ordering, which ensures the SMP-safe synchronisation and the ordering between the mode setting and relevant operations. Fixes: 22fd532eaa0c ("coresight: etm3x: adding operation mode for etm_enable()") Reviewed-by: Mike Leach Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20251111-arm_coresight_power_management_fix-v6-1-f55553b6c8b3@arm.com --- include/linux/coresight.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 56d0108658db..2b48be97fcd0 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -251,15 +251,11 @@ struct coresight_trace_id_map { * by @coresight_ops. * @access: Device i/o access abstraction for this device. * @dev: The device entity associated to this component. - * @mode: This tracer's mode, i.e sysFS, Perf or disabled. This is - * actually an 'enum cs_mode', but is stored in an atomic type. - * This is always accessed through local_read() and local_set(), - * but wherever it's done from within the Coresight device's lock, - * a non-atomic read would also work. This is the main point of - * synchronisation between code happening inside the sysfs mode's - * coresight_mutex and outside when running in Perf mode. A compare - * and exchange swap is done to atomically claim one mode or the - * other. + * @mode: The device mode, i.e sysFS, Perf or disabled. This is actually + * an 'enum cs_mode' but stored in an atomic type. Access is always + * through atomic APIs, ensuring SMP-safe synchronisation between + * racing from sysFS and Perf mode. A compare-and-exchange + * operation is done to atomically claim one mode or the other. * @refcnt: keep track of what is in use. Only access this outside of the * device's spinlock when the coresight_mutex held and mode == * CS_MODE_SYSFS. 
Otherwise it must be accessed from inside the @@ -288,7 +284,7 @@ struct coresight_device { const struct coresight_ops *ops; struct csdev_access access; struct device dev; - local_t mode; + atomic_t mode; int refcnt; bool orphan; /* sink specific fields */ @@ -624,13 +620,14 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) static inline bool coresight_take_mode(struct coresight_device *csdev, enum cs_mode new_mode) { - return local_cmpxchg(&csdev->mode, CS_MODE_DISABLED, new_mode) == - CS_MODE_DISABLED; + int curr = CS_MODE_DISABLED; + + return atomic_try_cmpxchg_acquire(&csdev->mode, &curr, new_mode); } static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev) { - return local_read(&csdev->mode); + return atomic_read_acquire(&csdev->mode); } static inline void coresight_set_mode(struct coresight_device *csdev, @@ -646,7 +643,7 @@ static inline void coresight_set_mode(struct coresight_device *csdev, WARN(new_mode != CS_MODE_DISABLED && current_mode != CS_MODE_DISABLED && current_mode != new_mode, "Device already in use\n"); - local_set(&csdev->mode, new_mode); + atomic_set_release(&csdev->mode, new_mode); } struct coresight_device *coresight_register(struct coresight_desc *desc); -- cgit v1.2.3 From bb8336a5163a5839476f27ed1ad69df4a19e13ca Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 10 Nov 2025 18:25:45 +0000 Subject: ethtool: fix incorrect kernel-doc style comment in ethtool.h Building documentation produced the following warning: WARNING: ./include/linux/ethtool.h:495 This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for This comment was not intended to be parsed as kernel-doc, so replace the '/**' with '/*' to silence the warning and align with normal comment style in header files. No functional changes. Signed-off-by: Kriish Sharma Link: https://patch.msgid.link/20251110182545.2112596-1-kriish.sharma2006@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..5c9162193d26 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,7 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 -/** +/* * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for * the end-of-list marker, total 17 items */ -- cgit v1.2.3 From 5422318e27d7a4662701f518e2e51b9f73a331b1 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 11 Nov 2025 14:24:48 +0200 Subject: net/mlx5: Expose definition for 1600Gbps link mode This patch exposes new link mode for 1600Gbps, utilizing 8 lanes at 200Gbps per lane. 
Co-developed-by: Yael Chemla Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762863888-1092798-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 58770b86f793..1df9d9a57bbc 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -112,6 +112,7 @@ enum mlx5e_ext_link_mode { MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, + MLX5E_1600TAUI_8_1600TBASE_CR8_KR8 = 23, MLX5E_EXT_LINK_MODES_NUMBER, }; -- cgit v1.2.3 From 4be9f3cc582a24b08f6580f65fa48a4d70332ab5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:43 -0500 Subject: filelock: rework the __break_lease API to use flags Currently __break_lease takes both a type and an openmode. With the addition of directory leases, that makes less sense. Declare a set of LEASE_BREAK_* flags that can be used to control how lease breaks work instead of requiring a type and an openmode. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-2-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- fs/locks.c | 29 +++++++++++++++++---------- include/linux/filelock.h | 52 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index b33c327c21dc..3cdd84a0fbed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1529,24 +1529,31 @@ any_leases_conflict(struct inode *inode, struct file_lease *breaker) /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return - * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: - * break all leases - * @type: FL_LEASE: break leases and delegations; FL_DELEG: break - * only delegations + * @flags: LEASE_BREAK_* flags * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on - * a call to open() or truncate(). This function can sleep unless you - * specified %O_NONBLOCK to your open(). + * a call to open() or truncate(). This function can block waiting for the + * lease break unless you specify LEASE_BREAK_NONBLOCK. */ -int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +int __break_lease(struct inode *inode, unsigned int flags) { - int error = 0; - struct file_lock_context *ctx; struct file_lease *new_fl, *fl, *tmp; + struct file_lock_context *ctx; unsigned long break_time; - int want_write = (mode & O_ACCMODE) != O_RDONLY; + unsigned int type; LIST_HEAD(dispose); + bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY); + int error = 0; + + if (flags & LEASE_BREAK_LEASE) + type = FL_LEASE; + else if (flags & LEASE_BREAK_DELEG) + type = FL_DELEG; + else if (flags & LEASE_BREAK_LAYOUT) + type = FL_LAYOUT; + else + return -EINVAL; new_fl = lease_alloc(NULL, type, want_write ? 
F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) @@ -1595,7 +1602,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) if (list_empty(&ctx->flc_lease)) goto out; - if (mode & O_NONBLOCK) { + if (flags & LEASE_BREAK_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c2ce8ba05d06..47da6aa28d8d 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -212,7 +212,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); -int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); + +#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations +#define LEASE_BREAK_DELEG BIT(1) // break delegations only +#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only +#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break +#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event + +int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); @@ -367,7 +374,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f return -ENOLCK; } -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } @@ -428,6 +435,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) } #ifdef CONFIG_FILE_LOCKING +static inline unsigned int openmode_to_lease_flags(unsigned int mode) +{ + unsigned int flags = 0; + + if ((mode & O_ACCMODE) == O_RDONLY) + flags |= LEASE_BREAK_OPEN_RDONLY; + if (mode & O_NONBLOCK) + flags |= LEASE_BREAK_NONBLOCK; + return flags; +} + static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; @@ -443,11 +461,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); + return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; @@ -461,8 +479,10 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) if (!flctx) return 0; smp_mb(); - if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); + if (!list_empty_careful(&flctx->flc_lease)) { + flags |= LEASE_BREAK_DELEG; + return __break_lease(inode, flags); + } return 0; } @@ -470,7 +490,7 @@ static inline int try_break_deleg(struct inode *inode, struct inode **delegated_ { int ret; - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); @@ -482,7 +502,7 @@ static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; - ret = break_deleg(*delegated_inode, O_WRONLY); + ret = break_deleg(*delegated_inode, 0); iput(*delegated_inode); *delegated_inode = NULL; return ret; @@ -491,20 +511,24 
@@ static inline int break_deleg_wait(struct inode **delegated_inode) static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + unsigned int flags = LEASE_BREAK_LAYOUT; + + if (!wait) + flags |= LEASE_BREAK_NONBLOCK; + + return __break_lease(inode, flags); + } return 0; } #else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) +static inline int break_lease(struct inode *inode, bool wait) { return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } -- cgit v1.2.3 From 6976ed2dd0d59086d16d853ac9b21776be68aaad Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:44 -0500 Subject: filelock: add struct delegated_inode The current API requires a pointer to an inode pointer. It's easy for callers to get this wrong. Add a new delegated_inode structure and use that to pass back any inode that needs to be waited on. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-3-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- fs/namei.c | 18 +++++++++--------- fs/open.c | 8 ++++---- fs/posix_acl.c | 8 ++++---- fs/utimes.c | 4 ++-- fs/xattr.c | 12 ++++++------ include/linux/filelock.h | 36 +++++++++++++++++++++++++++--------- include/linux/fs.h | 9 +++++---- include/linux/xattr.h | 4 ++-- 9 files changed, 60 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/attr.c b/fs/attr.c index 795f231d00e8..b9ec6b47bab2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -415,7 +415,7 @@ EXPORT_SYMBOL(may_setattr); * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr, struct inode **delegated_inode) + struct iattr *attr, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..bf42f146f847 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4648,7 +4648,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, struct inode **delegated_inode) + struct dentry *dentry, struct delegated_inode *delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); @@ -4706,7 +4706,7 @@ int do_unlinkat(int dfd, struct filename *name) struct qstr last; int type; struct inode *inode = NULL; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); @@ -4743,7 +4743,7 @@ exit3: if (inode) iput(inode); /* truncate the inode here */ inode = NULL; - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -4892,7 +4892,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4968,7 +4968,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int how = 0; int error; @@ -5012,7 +5012,7 @@ retry: new_dentry, &delegated_inode); out_dput: end_creating_path(&new_path, new_dentry); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); @@ -5098,7 +5098,7 @@ int vfs_rename(struct renamedata *rd) struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; - struct inode **delegated_inode = rd->delegated_inode; + struct delegated_inode *delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; @@ -5261,7 +5261,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; @@ -5369,7 +5369,7 @@ exit4: exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/open.c b/fs/open.c index 3d64372ecc67..fdaa6f08f6f4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -631,7 +631,7 @@ out: int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; @@ -651,7 +651,7 @@ retry_deleg: &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -756,7 +756,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode 
*inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -791,7 +791,7 @@ retry_deleg: error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4050942ab52f..768f027c1428 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1091,7 +1091,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1141,7 +1141,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -1212,7 +1212,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1249,7 +1249,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/utimes.c b/fs/utimes.c index c7c7958e57b2..bf9f45bdef54 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; if (times) { if (!nsec_valid(times[0].tv_nsec) || @@ -66,7 +66,7 @@ retry_deleg: error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/xattr.c b/fs/xattr.c index 8851a5ef34f5..32d445fb60aa 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -274,7 +274,7 @@ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, - int flags, struct inode **delegated_inode) + int flags, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -305,7 +305,7 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; const void *orig_value = value; int error; @@ -322,7 +322,7 @@ retry_deleg: flags, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -533,7 +533,7 @@ EXPORT_SYMBOL(__vfs_removexattr); int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -567,7 +567,7 
@@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; retry_deleg: @@ -576,7 +576,7 @@ retry_deleg: name, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 47da6aa28d8d..208d108df2d7 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -486,25 +486,35 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +struct delegated_inode { + struct inode *di_inode; +}; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return di->di_inode; +} + +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *di) { int ret; ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; + if (ret == -EWOULDBLOCK && di) { + di->di_inode = inode; ihold(inode); } return ret; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *di) { int ret; - ret = break_deleg(*delegated_inode, 0); - iput(*delegated_inode); - *delegated_inode = NULL; + ret = break_deleg(di->di_inode, 0); + iput(di->di_inode); + di->di_inode = NULL; return ret; } @@ -523,6 +533,13 @@ static inline int break_layout(struct inode *inode, bool wait) } #else /* !CONFIG_FILE_LOCKING */ +struct delegated_inode { }; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return false; +} + static inline int break_lease(struct inode *inode, bool wait) { return 0; @@ -533,12 +550,13 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *delegated_inode) { return 0; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..909a88e3979d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -80,6 +80,7 @@ struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; +struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2119,10 +2120,10 @@ int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, - struct dentry *, struct inode **); + struct dentry *, struct delegated_inode *); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, - struct inode **); + struct delegated_inode *); /** * struct renamedata - contains all information required for renaming @@ -2140,7 +2141,7 @@ struct renamedata { struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; - struct inode **delegated_inode; + struct delegated_inode *delegated_inode; unsigned int flags; } 
__randomize_layout; @@ -3071,7 +3072,7 @@ static inline int bmap(struct inode *inode, sector_t *block) #endif int notify_change(struct mnt_idmap *, struct dentry *, - struct iattr *, struct inode **); + struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 86b0d47984a1..64e9afe7d647 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, - struct inode **); + struct delegated_inode *); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, - const char *, struct inode **); + const char *, struct delegated_inode *); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -- cgit v1.2.3 From e12d203b8c880061c0bf0339cad51e5851a33442 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:47 -0500 Subject: vfs: allow mkdir to wait for delegation break on parent In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a new delegated_inode parameter to vfs_mkdir. All of the existing callers set that to NULL for now, except for do_mkdirat which will properly block until the lease is gone. 
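
For reference, the caller-side pattern this adds to do_mkdirat() can be condensed into the hypothetical helper below (illustrative only; the real code in the diff also handles umask stripping, the security_path_mkdir() hook and ESTALE retries):

	/*
	 * Condensed, illustrative sketch: redo the lookup on every retry,
	 * since the delegation break may have changed the parent directory.
	 */
	static int mkdirat_wait_for_deleg(int dfd, struct filename *name, umode_t mode)
	{
		struct delegated_inode delegated_inode = { };
		struct dentry *dentry;
		struct path path;
		int error;

	retry:
		dentry = filename_create(dfd, name, &path, LOOKUP_DIRECTORY);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);

		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
				   dentry, mode, &delegated_inode);
		error = PTR_ERR_OR_ZERO(dentry);
		end_creating_path(&path, dentry);

		/* the parent was delegated: wait for the recall, then start over */
		if (is_delegated(&delegated_inode)) {
			error = break_deleg_wait(&delegated_inode);
			if (!error)
				goto retry;
		}
		return error;
	}
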
Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-6-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 2 +- fs/cachefiles/namei.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/init.c | 2 +- fs/namei.c | 24 ++++++++++++++++++------ fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- fs/smb/server/vfs.c | 2 +- fs/xfs/scrub/orphanage.c | 2 +- include/linux/fs.h | 2 +- 11 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 9d4e46ad8352..0e79621cb0f7 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -180,7 +180,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL); if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d1edb2ac3837..50c0f9c76d1f 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -130,7 +130,7 @@ retry: goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) - subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL); else subdir = ERR_PTR(ret); if (IS_ERR(subdir)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..35830b3144f8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -508,7 +508,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto out; lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); + lower_dentry, mode, NULL); rc = PTR_ERR(lower_dentry); if (IS_ERR(lower_dentry)) goto out; diff --git a/fs/init.c b/fs/init.c index 07f592ccdba8..895f8a09a71a 100644 --- a/fs/init.c +++ b/fs/init.c @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) error = security_path_mkdir(&path, dentry, mode); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, NULL); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } diff --git a/fs/namei.c b/fs/namei.c index 5bcf3e93d350..76c0587d991f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4407,10 +4407,11 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory returning correct dentry if possible - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory - * @mode: mode of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a directory. * @@ -4427,7 +4428,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
*/ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct dentry *dentry, umode_t mode, + struct delegated_inode *delegated_inode) { int error; unsigned max_links = dir->i_sb->s_max_links; @@ -4450,6 +4452,10 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (max_links && dir->i_nlink >= max_links) goto err; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto err; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) @@ -4473,6 +4479,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; + struct delegated_inode delegated_inode = { }; retry: dentry = filename_create(dfd, name, &path, lookup_flags); @@ -4484,11 +4491,16 @@ retry: mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, &delegated_inode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e2b9472e5c78..1f56834b2072 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL); if (IS_ERR(dentry)) status = PTR_ERR(dentry); out_put: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..97aef140cbf5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1558,7 +1558,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL); if (IS_ERR(dchild)) { host_err = PTR_ERR(dchild); } else if (d_is_negative(dchild)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index c8fd5951fc5e..0f65f9a5d54d 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -248,7 +248,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, { struct dentry *ret; - ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, NULL); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); return ret; } diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 891ed2dc2b73..3d2190f26623 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -230,7 +230,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; d = dentry; - dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode, NULL); if (IS_ERR(dentry)) err = PTR_ERR(dentry); else if (d_is_negative(dentry)) diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 9c12cb844231..91c9d07b97f3 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -167,7 +167,7 @@ xrep_orphanage_create( */ if (d_really_is_negative(orphanage_dentry)) { 
orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, - orphanage_dentry, 0750); + orphanage_dentry, 0750, NULL); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) goto out_unlock_root; diff --git a/include/linux/fs.h b/include/linux/fs.h index 909a88e3979d..20bb4c8a4e8e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2114,7 +2114,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); + struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, -- cgit v1.2.3 From 4fa76319cd0cc97ca54ff71c94814dc5b1983ad2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:48 -0500 Subject: vfs: allow rmdir to wait for delegation break on parent In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a delegated_inode struct to vfs_rmdir() and populate that pointer with the parent inode if it's non-NULL. Most existing in-kernel callers pass in a NULL pointer. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-7-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 22 +++++++++++++++++----- fs/nfsd/nfs4recover.c | 4 ++-- fs/nfsd/vfs.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- fs/smb/server/vfs.c | 4 ++-- include/linux/fs.h | 3 ++- 8 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 0e79621cb0f7..104025104ef7 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -261,7 +261,7 @@ static int dev_rmdir(const char *name) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), - dentry); + dentry, NULL); else err = -EPERM; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 35830b3144f8..88631291b325 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -540,7 +540,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); } if (!rc) { clear_nlink(d_inode(dentry)); diff --git a/fs/namei.c b/fs/namei.c index 76c0587d991f..9e0393a92091 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4522,9 +4522,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @delegated_inode: returns parent inode, if it's delegated. * * Remove a directory. * @@ -4535,7 +4536,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry) + struct dentry *dentry, struct delegated_inode *delegated_inode) { int error = may_delete(idmap, dir, dentry, 1); @@ -4557,6 +4558,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, if (error) goto out; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -4583,6 +4588,7 @@ int do_rmdir(int dfd, struct filename *name) struct qstr last; int type; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) @@ -4612,7 +4618,8 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit4; - error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit4: dput(dentry); exit3: @@ -4620,6 +4627,11 @@ exit3: mnt_drop_write(path.mnt); exit2: path_put(&path); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1f56834b2072..30bae93931d9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -337,7 +337,7 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL); out: dput(dentry); out_unlock: @@ -427,7 +427,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 97aef140cbf5..c400ea94ff2e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2108,7 +2108,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, break; } } else { - host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL); } fh_fill_post_attrs(fhp); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0f65f9a5d54d..d215d7349489 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -206,7 +206,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 3d2190f26623..c5f0f3170d58 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -609,7 +609,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { - err = vfs_rmdir(idmap, d_inode(parent), path->dentry); + err = vfs_rmdir(idmap, d_inode(parent), path->dentry, NULL); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "rmdir failed, err %d\n", err); } else { @@ -1090,7 +1090,7 @@ int 
ksmbd_vfs_unlink(struct file *filp) dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(idmap, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry, NULL); else err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index 20bb4c8a4e8e..12873214e1c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2121,7 +2121,8 @@ int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); -int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, + struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); -- cgit v1.2.3 From 85bbffcad7307e2ca6136be657cc21b0e1c42241 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:50 -0500 Subject: vfs: clean up argument list for vfs_create() As Neil points out: "I would be in favour of dropping the "dir" arg because it is always d_inode(dentry->d_parent) which is stable." ...and... "Also *every* caller of vfs_create() passes ".excl = true". So maybe we don't need that arg at all." Drop both arguments from vfs_create() and fix up the callers. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-9-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 3 +-- fs/namei.c | 11 ++++------- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/vfs.c | 3 +-- fs/open.c | 4 +--- fs/overlayfs/overlayfs.h | 2 +- fs/smb/server/vfs.c | 3 +-- include/linux/fs.h | 3 +-- 8 files changed, 11 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 88631291b325..d109e3763a88 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -188,8 +188,7 @@ ecryptfs_do_create(struct inode *directory_inode, rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_create(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, true); + rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); diff --git a/fs/namei.c b/fs/namei.c index f439429bdfa2..9586c6aba6ea 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3461,10 +3461,8 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file - * @want_excl: whether the file must not yet exist * * Create a new file. * @@ -3474,9 +3472,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ -int vfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool want_excl) +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { + struct inode *dir = d_inode(dentry->d_parent); int error; error = may_create(idmap, dir, dentry); @@ -3490,7 +3488,7 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); + error = dir->i_op->create(idmap, dir, dentry, mode, true); if (!error) fsnotify_create(dir, dentry); return error; @@ -4383,8 +4381,7 @@ retry: idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(idmap, path.dentry->d_inode, - dentry, mode, true); + error = vfs_create(idmap, dentry, mode); if (!error) security_path_post_mknod(idmap, dentry); break; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b6d03e1ef5f7..30ea7ffa2aff 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -344,7 +344,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = fh_fill_pre_attrs(fhp); if (status != nfs_ok) goto out; - host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode); if (host_err < 0) { status = nfserrno(host_err); goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c400ea94ff2e..464fd54675f3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1552,8 +1552,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode); if (!host_err) nfsd_check_ignore_resizing(iap); break; diff --git a/fs/open.c b/fs/open.c index fdaa6f08f6f4..e440f58e3ce8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1171,9 +1171,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_idmap(path->mnt), - d_inode(path->dentry->d_parent), - path->dentry, mode, true); + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode); if (!error) error = vfs_open(path, f); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d215d7349489..2bdc434941eb 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index c5f0f3170d58..83ece2de4b23 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -188,8 +188,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), - dentry, mode, true); + err = vfs_create(mnt_idmap(path.mnt), dentry, mode); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 12873214e1c7..21876ef1fec9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2111,8 +2111,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, /* * VFS helper functions.. 
*/ -int vfs_create(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t, bool); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From c826229c6a82fe1fe7b7752692f87a881eb4b545 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:51 -0500 Subject: vfs: make vfs_create break delegations on parent directory In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a delegated_inode parameter to vfs_create. Most callers are converted to pass in NULL, but do_mknodat() is changed to wait for a delegation break if there is one. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-10-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 15 +++++++++++++-- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/open.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- fs/smb/server/vfs.c | 2 +- include/linux/fs.h | 3 ++- 8 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d109e3763a88..3341f00dd087 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -188,7 +188,7 @@ ecryptfs_do_create(struct inode *directory_inode, rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode); + rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); diff --git a/fs/namei.c b/fs/namei.c index 9586c6aba6ea..b20f053374a5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3463,6 +3463,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * @idmap: idmap of the mount the inode was found from * @dentry: dentry of the child file * @mode: mode of the child file + * @di: returns parent inode, if the inode is delegated. * * Create a new file. * @@ -3472,7 +3473,8 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ -int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, + struct delegated_inode *di) { struct inode *dir = d_inode(dentry->d_parent); int error; @@ -3486,6 +3488,9 @@ int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = try_break_deleg(dir, di); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, true); @@ -4358,6 +4363,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { + struct delegated_inode di = { }; struct mnt_idmap *idmap; struct dentry *dentry; struct path path; @@ -4381,7 +4387,7 @@ retry: idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(idmap, dentry, mode); + error = vfs_create(idmap, dentry, mode, &di); if (!error) security_path_post_mknod(idmap, dentry); break; @@ -4396,6 +4402,11 @@ retry: } out2: end_creating_path(&path, dentry); + if (is_delegated(&di)) { + error = break_deleg_wait(&di); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 30ea7ffa2aff..2cb972b5ed99 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -344,7 +344,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = fh_fill_pre_attrs(fhp); if (status != nfs_ok) goto out; - host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode); + host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL); if (host_err < 0) { status = nfserrno(host_err); goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 464fd54675f3..de5f46f8c6d3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1552,7 +1552,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode); + host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL); if (!host_err) nfsd_check_ignore_resizing(iap); break; diff --git a/fs/open.c b/fs/open.c index e440f58e3ce8..92cf2e11781b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1171,7 +1171,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode); + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); if (!error) error = vfs_open(path, f); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 2bdc434941eb..e30441cc9c63 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode, NULL); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 83ece2de4b23..3747851b61c8 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -188,7 +188,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_idmap(path.mnt), dentry, mode); + err = vfs_create(mnt_idmap(path.mnt), dentry, mode, 
NULL); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 21876ef1fec9..83b05aec4e10 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2111,7 +2111,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, /* * VFS helper functions.. */ -int vfs_create(struct mnt_idmap *, struct dentry *, umode_t); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, + struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From e8960c1b2ee9ba75d65492b8e90e851d11e5f215 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:52 -0500 Subject: vfs: make vfs_mknod break delegations on parent directory In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a new delegated_inode pointer to vfs_mknod() and have the appropriate callers wait when there is an outstanding delegation. All other callers just set the pointer to NULL. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-11-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/init.c | 2 +- fs/namei.c | 23 +++++++++++++++-------- fs/nfsd/vfs.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- include/linux/fs.h | 4 ++-- net/unix/af_unix.c | 2 +- 8 files changed, 23 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 104025104ef7..2f576ecf1832 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -231,7 +231,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, - dev->devt); + dev->devt, NULL); if (!err) { struct iattr newattrs; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3341f00dd087..83f06452476d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -564,7 +564,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) rc = vfs_mknod(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, dev); + lower_dentry, mode, dev, NULL); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); diff --git a/fs/init.c b/fs/init.c index 895f8a09a71a..4f02260dd65b 100644 --- a/fs/init.c +++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), NULL); end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index b20f053374a5..e9616134390f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4295,13 +4295,15 @@ inline struct dentry *start_creating_user_path( } EXPORT_SYMBOL(start_creating_user_path); + /** * vfs_mknod - create device node or file - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child device node - * @mode: mode of the child device node - * @dev: 
device number of device to create + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child device node + * @mode: mode of the child device node + * @dev: device number of device to create + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a device node or file. * @@ -4312,7 +4314,8 @@ EXPORT_SYMBOL(start_creating_user_path); * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev, + struct delegated_inode *delegated_inode) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); @@ -4336,6 +4339,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -4393,11 +4400,11 @@ retry: break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, 0); + dentry, mode, 0, &di); break; } out2: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index de5f46f8c6d3..6684935007b1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1573,7 +1573,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFIFO: case S_IFSOCK: host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, rdev); + iap->ia_mode, rdev, NULL); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e30441cc9c63..afd95721f76e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -257,7 +257,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev, NULL); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; diff --git a/include/linux/fs.h b/include/linux/fs.h index 83b05aec4e10..1a5d86cfafaa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2116,7 +2116,7 @@ int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t, dev_t); + umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, @@ -2152,7 +2152,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, - WHITEOUT_DEV); + WHITEOUT_DEV, NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 768098dec231..db1fd8d6a84c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1399,7 +1399,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, idmap = mnt_idmap(parent.mnt); err = 
security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); -- cgit v1.2.3 From 92bf53577f01aad988f7f39f69163b41f94cfb7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:53 -0500 Subject: vfs: make vfs_symlink break delegations on parent dir In order to add directory delegation support, we must break delegations on the parent on any change to the directory. Add a delegated_inode parameter to vfs_symlink() and have it break the delegation. do_symlinkat() can then wait on the delegation break before proceeding. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-12-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 2 +- fs/init.c | 2 +- fs/namei.c | 16 ++++++++++++++-- fs/nfsd/vfs.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- include/linux/fs.h | 2 +- 6 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 83f06452476d..ba15e7359dfa 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -479,7 +479,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, if (rc) goto out_lock; rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, - encoded_symname); + encoded_symname, NULL); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) goto out_lock; diff --git a/fs/init.c b/fs/init.c index 4f02260dd65b..e0f5429c0a49 100644 --- a/fs/init.c +++ b/fs/init.c @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, oldname); + dentry, oldname, NULL); end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index e9616134390f..d5ab28947b2b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4845,6 +4845,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to + * @delegated_inode: returns victim inode, if the inode is delegated. * * Create a symlink. * @@ -4855,7 +4856,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, const char *oldname) + struct dentry *dentry, const char *oldname, + struct delegated_inode *delegated_inode) { int error; @@ -4870,6 +4872,10 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -4883,6 +4889,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to) struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; if (IS_ERR(from)) { error = PTR_ERR(from); @@ -4897,8 +4904,13 @@ retry: error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, from->name); + dentry, from->name, &delegated_inode); end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6684935007b1..28710da4cce7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1742,7 +1742,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; - host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index afd95721f76e..5065961bd370 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -267,7 +267,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname, NULL); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a5d86cfafaa..64323e618724 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2118,7 +2118,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, - struct dentry *, const char *); + struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From 1602bad16d7df82faca6d7c70821117684a66f49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:58 -0500 Subject: vfs: expose delegation support to userland Now that support for recallable directory delegations is available, expose this functionality to userland with new F_SETDELEG and F_GETDELEG commands for fcntl(). Note that this also allows userland to request a FL_DELEG type lease on files too. Userland applications that do will get signalled when there are metadata changes in addition to just data changes (which is a limitation of FL_LEASE leases). 
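
As a rough illustration, a userland consumer might look like the sketch below. It assumes the F_SETDELEG/F_GETDELEG commands and the struct delegation layout added further down in this patch; they are redefined locally here in case the installed uapi headers do not carry them yet.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdint.h>
	#include <stdio.h>

	#ifndef F_SETDELEG
	#define F_GETDELEG	(1024 + 15)	/* F_LINUX_SPECIFIC_BASE + 15 */
	#define F_SETDELEG	(1024 + 16)	/* F_LINUX_SPECIFIC_BASE + 16 */
	struct delegation {
		uint32_t d_flags;	/* must be 0 */
		uint16_t d_type;	/* F_RDLCK, F_WRLCK, F_UNLCK */
		uint16_t __pad;		/* must be 0 */
	};
	#endif

	int main(void)
	{
		struct delegation deleg = { .d_type = F_RDLCK };
		int fd = open(".", O_RDONLY | O_DIRECTORY);

		if (fd < 0)
			return 1;
		/* pick the signal used to notify us when the delegation is broken */
		if (fcntl(fd, F_SETSIG, SIGIO) < 0)
			return 1;
		/* may fail unless we own the directory (same rules as F_SETLEASE) */
		if (fcntl(fd, F_SETDELEG, &deleg) < 0) {
			perror("F_SETDELEG");
			return 1;
		}
		/* read back the delegation type actually held */
		deleg = (struct delegation){ 0 };
		if (fcntl(fd, F_GETDELEG, &deleg) == 0)
			printf("delegation type: %u\n", (unsigned int)deleg.d_type);
		return 0;
	}
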
These commands accept a new "struct delegation" argument that contains a flags field for future expansion. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fcntl.c | 13 +++++++++++++ fs/locks.c | 45 ++++++++++++++++++++++++++++++++++++++++----- include/linux/filelock.h | 12 ++++++++++++ include/uapi/linux/fcntl.h | 11 +++++++++++ 4 files changed, 76 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/fcntl.c b/fs/fcntl.c index 72f8433d9109..f93dbca08435 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -445,6 +445,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + struct delegation deleg; int argi = (int)arg; struct flock flock; long err = -EINVAL; @@ -550,6 +551,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, arg); break; + case F_GETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_getdeleg(filp, &deleg); + if (!err && copy_to_user(argp, &deleg, sizeof(deleg))) + return -EFAULT; + break; + case F_SETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_setdeleg(fd, filp, &deleg); + break; default: break; } diff --git a/fs/locks.c b/fs/locks.c index dd290a87f58e..7f4ccc7974bc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1703,7 +1703,7 @@ EXPORT_SYMBOL(lease_get_mtime); * XXX: sfr & willy disagree over whether F_INPROGRESS * should be returned to userspace. */ -int fcntl_getlease(struct file *filp) +static int __fcntl_getlease(struct file *filp, unsigned int flavor) { struct file_lease *fl; struct inode *inode = file_inode(filp); @@ -1719,7 +1719,8 @@ int fcntl_getlease(struct file *filp) list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file != filp) continue; - type = target_leasetype(fl); + if (fl->c.flc_flags & flavor) + type = target_leasetype(fl); break; } spin_unlock(&ctx->flc_lock); @@ -1730,6 +1731,19 @@ int fcntl_getlease(struct file *filp) return type; } +int fcntl_getlease(struct file *filp) +{ + return __fcntl_getlease(filp, FL_LEASE); +} + +int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + deleg->d_type = __fcntl_getlease(filp, FL_DELEG); + return 0; +} + /** * check_conflicting_open - see if the given file points to an inode that has * an existing open that would conflict with the @@ -2039,13 +2053,13 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg) { struct file_lease *fl; struct fasync_struct *new; int error; - fl = lease_alloc(filp, FL_LEASE, arg); + fl = lease_alloc(filp, flavor, arg); if (IS_ERR(fl)) return PTR_ERR(fl); @@ -2081,7 +2095,28 @@ int fcntl_setlease(unsigned int fd, struct file *filp, int arg) if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); - return do_fcntl_add_lease(fd, filp, arg); + return do_fcntl_add_lease(fd, filp, FL_LEASE, arg); +} + +/** + * fcntl_setdeleg - sets a delegation on an open file + * @fd: open file descriptor + * @filp: file pointer + * @deleg: delegation request from userland + * + * Call this fcntl 
to establish a delegation on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + /* For now, no flags are supported */ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + + if (deleg->d_type == F_UNLCK) + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); + return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type); } /** diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 208d108df2d7..54b824c05299 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); +int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { @@ -278,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + +static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + static inline bool lock_is_unlock(struct file_lock *fl) { return false; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 3741ea1b73d8..008fac15e573 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -79,6 +79,17 @@ */ #define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET +/* Set/Get delegations */ +#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15) +#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16) + +/* Argument structure for F_GETDELEG and F_SETDELEG */ +struct delegation { + uint32_t d_flags; /* Must be 0 */ + uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + uint16_t __pad; /* Must be 0 */ +}; + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From a3f8f8662771285511ae26c4c8d3ba1cd22159b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Nov 2025 14:39:45 +0100 Subject: power: always freeze efivarfs The efivarfs filesystems must always be frozen and thawed to resync variable state. Make it so. 
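
For context, any other filesystem that needs to resync state across a suspend/resume cycle would opt in the same way efivarfs does in the diff below. A hypothetical, illustrative example (examplefs is not a real filesystem):

	#include <linux/fs.h>
	#include <linux/fs_context.h>
	#include <linux/module.h>

	/* stub, for illustration only */
	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		return -ENODEV;
	}

	static struct file_system_type examplefs_type = {
		.owner		 = THIS_MODULE,
		.name		 = "examplefs",
		.init_fs_context = examplefs_init_fs_context,
		.kill_sb	 = kill_anon_super,
		/* always freeze/thaw this fs around suspend and hibernate */
		.fs_flags	 = FS_POWER_FREEZE,
	};
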
Link: https://patch.msgid.link/20251105-vorbild-zutreffen-fe00d1dd98db@brauner Signed-off-by: Christian Brauner --- fs/efivarfs/super.c | 1 + fs/super.c | 13 ++++++++++--- include/linux/fs.h | 3 ++- kernel/power/hibernate.c | 9 +++------ kernel/power/suspend.c | 3 +-- 5 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..277b84e5c279 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1183,11 +1183,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1204,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ea98c6cce81..249a1da8440e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2689,6 +2689,7 @@ struct file_system_type { #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ +#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -3606,7 +3607,7 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -void filesystems_freeze(void); +void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); extern int dcache_dir_open(struct inode *, struct file *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 14e85ff23551..1f250ce036a0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -825,8 +825,7 @@ int hibernate(void) goto Restore; ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1083,8 +1081,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bb4686c1c08..c933a63a9718 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); -- cgit v1.2.3 From 12741624645e098b2234a5ae341045a97473caf1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:24 +0100 Subject: fs: add iput_not_last() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 12 ++++++++++++ include/linux/fs.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cff1d3af0d57 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1967,6 +1967,18 @@ retry: } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file diff --git a/include/linux/fs.h b/include/linux/fs.h index 249a1da8440e..dd3b57cfadee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2824,6 +2824,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); -- cgit v1.2.3 From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:52 -0800 Subject: docs: document iomap writeback's 
iomap_finish_folio_write() requirement Document that iomap_finish_folio_write() must be called after writeback on the range completes. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 3 +++ include/linux/iomap.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index c88205132039..4d30723be7fa 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -361,6 +361,9 @@ The fields are as follows: delalloc reservations to avoid having delalloc reservations for clean pagecache. This function must be supplied by the filesystem. + If this succeeds, iomap_finish_folio_write() must be called once writeback + completes for the range, regardless of whether the writeback succeeded or + failed. - ``writeback_submit``: Submit the previous built writeback context. Block based file systems should use the iomap_ioend_writeback_submit diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..a5032e456079 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -435,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, -- cgit v1.2.3 From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:53 -0800 Subject: iomap: optimize pending async writeback accounting Pending writebacks must be accounted for to determine when all requests have completed and writeback on the folio should be ended. Currently this is done by atomically incrementing ifs->write_bytes_pending for every range to be written back. Instead, the number of atomic operations can be minimized by setting ifs->write_bytes_pending to the folio size, internally tracking how many bytes are written back asynchronously, and then after sending off all the requests, decrementing ifs->write_bytes_pending by the number of bytes not written back asynchronously. Now, for N ranges written back, only N + 2 atomic operations are required instead of 2N + 2. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fuse/file.c | 4 ++-- fs/iomap/buffered-io.c | 58 +++++++++++++++++++++++++++++--------------------- fs/iomap/ioend.c | 2 -- include/linux/iomap.h | 2 -- 4 files changed, 36 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8275b6681b9b..b343a6f37563 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1885,7 +1885,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. 
*/ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2221,7 +2222,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0eb439b523b1..1873a2f74883 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1641,16 +1641,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. + */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1667,7 +1676,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1681,11 +1690,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. */ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1756,7 +1765,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1776,14 +1785,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. 
- */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1798,13 +1800,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1822,12 +1824,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ new_ioend: if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5032e456079..b49e47f069db 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); -- cgit v1.2.3 From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:54 -0800 Subject: iomap: simplify ->read_folio_range() error handling for reads Instead of requiring that the caller calls iomap_finish_folio_read() even if the ->read_folio_range() callback returns an error, account for this internally in iomap instead, which makes the interface simpler and makes it match writeback's ->read_folio_range() error handling expectations. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 7 ++- fs/fuse/file.c | 10 +--- fs/iomap/buffered-io.c | 63 ++++++++++++++------------ include/linux/iomap.h | 5 +- 4 files changed, 41 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 4d30723be7fa..64f4baf5750e 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -149,10 +149,9 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: iomap calls these functions: - ``read_folio_range``: Called to read in the range. This must be provided - by the caller. The caller is responsible for calling - iomap_finish_folio_read() after reading in the folio range. This should be - done even if an error is encountered during the read. This returns 0 on - success or a negative error on failure. + by the caller. If this succeeds, iomap_finish_folio_read() must be called + after the range is read in, regardless of whether the read succeeded or + failed. - ``submit_read``: Submit any pending read requests. This function is optional. diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b343a6f37563..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -922,13 +922,6 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, if (ctx->rac) { ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); - /* - * If fuse_handle_readahead was successful, fuse_readpages_end - * will do the iomap_finish_folio_read, else we need to call it - * here - */ - if (ret) - iomap_finish_folio_read(folio, off, len, ret); } else { /* * for non-readahead read requests, do reads synchronously @@ -936,7 +929,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, * out-of-order reads */ ret = fuse_do_readfolio(file, folio, off, len); - iomap_finish_folio_read(folio, off, len, ret); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); } return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1873a2f74883..c82b5b24d4b3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -398,7 +398,8 @@ static void iomap_read_init(struct folio *folio) * has already finished reading in the entire folio. */ spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += len + 1; + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; spin_unlock_irq(&ifs->state_lock); } } @@ -414,43 +415,47 @@ static void iomap_read_init(struct folio *folio) */ static void iomap_read_end(struct folio *folio, size_t bytes_submitted) { - struct iomap_folio_state *ifs; - - /* - * If there are no bytes submitted, this means we are responsible for - * unlocking the folio here, since no IO helper has taken ownership of - * it. - */ - if (!bytes_submitted) { - folio_unlock(folio); - return; - } + struct iomap_folio_state *ifs = folio->private; - ifs = folio->private; if (ifs) { bool end_read, uptodate; - /* - * Subtract any bytes that were initially accounted to - * read_bytes_pending but skipped for IO. - * The +1 accounts for the bias we added in iomap_read_init(). 
- */ - size_t bytes_not_submitted = folio_size(folio) + 1 - - bytes_submitted; spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending -= bytes_not_submitted; - /* - * If !ifs->read_bytes_pending, this means all pending reads - * by the IO helper have already completed, which means we need - * to end the folio read here. If ifs->read_bytes_pending != 0, - * the IO helper will end the folio read. - */ - end_read = !ifs->read_bytes_pending; + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + end_read = true; + } else { + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 + * accounts for the bias we added in iomap_read_init(). + */ + size_t bytes_not_submitted = folio_size(folio) + 1 - + bytes_submitted; + ifs->read_bytes_pending -= bytes_not_submitted; + /* + * If !ifs->read_bytes_pending, this means all pending + * reads by the IO helper have already completed, which + * means we need to end the folio read here. If + * ifs->read_bytes_pending != 0, the IO helper will end + * the folio read. + */ + end_read = !ifs->read_bytes_pending; + } if (end_read) uptodate = ifs_is_fully_uptodate(folio, ifs); spin_unlock_irq(&ifs->state_lock); if (end_read) folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). + */ + folio_unlock(folio); } } @@ -498,10 +503,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, } else { if (!*bytes_submitted) iomap_read_init(folio); - *bytes_submitted += plen; ret = ctx->ops->read_folio_range(iter, ctx, plen); if (ret) return ret; + *bytes_submitted += plen; } ret = iomap_iter_advance(iter, plen); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b49e47f069db..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -495,9 +495,8 @@ struct iomap_read_ops { /* * Read in a folio range. * - * The caller is responsible for calling iomap_finish_folio_read() after - * reading in the folio range. This should be done even if an error is - * encountered during the read. + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. */ -- cgit v1.2.3 From 395b95530343e7f4bdd2870190d985a222997fb6 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:07 +0100 Subject: dcache: export shrink_dentry_list() and add new helper d_dispose_if_unused() Add and export a new helper d_dispose_if_unused() which is simply a wrapper around to_shrink_list(), to add an entry to a dispose list if it's not used anymore. Also export shrink_dentry_list() to kill all dentries in a dispose list. 
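A minimal sketch of how a caller outside dcache might combine the two exports, modeled on the reworked d_prune_aliases() in the diff below; the function name prune_unused_aliases() is illustrative only and not part of this patch:

/*
 * Illustrative only (assumes the usual dcache/fs headers): queue every
 * currently-unused alias of an inode on a private dispose list with the
 * new helper, then kill the whole list in one pass with the newly
 * exported shrink_dentry_list().
 */
static void prune_unused_aliases(struct inode *inode)
{
	struct dentry *dentry;
	LIST_HEAD(dispose);

	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias)
		d_dispose_if_unused(dentry, &dispose);
	spin_unlock(&inode->i_lock);

	shrink_dentry_list(&dispose);
}
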
Suggested-by: Miklos Szeredi Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/dcache.c | 18 ++++++++++++------ include/linux/dcache.h | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..bffb1b47a907 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1086,6 +1086,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. @@ -1096,12 +1105,8 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, &dispose); - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1141,6 +1146,7 @@ void shrink_dentry_list(struct list_head *list) shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..2bc1339bf6d0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); -- cgit v1.2.3 From 854e8df2ce6b02c8be40d6f26bd8aa700b375bb2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 23 Oct 2025 10:21:42 +0200 Subject: fs/pipe: stop duplicating union pipe_index declaration Now that we build with -fms-extensions, union pipe_index can be included as an anonymous member in struct pipe_inode_info, avoiding the duplication. Signed-off-by: Rasmus Villemoes Link: https://patch.msgid.link/20251023082142.2104456-1-linux@rasmusvillemoes.dk Signed-off-by: Nathan Chancellor Signed-off-by: Christian Brauner --- include/linux/pipe_fs_i.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 9d42d473d201..7f6a92ac9704 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t; typedef unsigned short pipe_index_t; #endif -/* - * We have to declare this outside 'struct pipe_inode_info', - * but then we can't use 'union pipe_index' for an anonymous - * union, so we end up having to duplicate this declaration - * below. Annoying. 
+/** + * struct pipe_index - pipe indeces + * @head: The point of buffer production + * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; @@ -63,9 +63,7 @@ union pipe_index { * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe - * @head: The point of buffer production - * @tail: The point of buffer consumption - * @head_tail: unsigned long union of @head and @tail + * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -87,14 +85,7 @@ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - /* This has to match the 'union pipe_index' above */ - union { - unsigned long head_tail; - struct { - pipe_index_t head; - pipe_index_t tail; - }; - }; + union pipe_index; unsigned int max_usage; unsigned int ring_size; -- cgit v1.2.3 From e631df89cd5d638a9d7c152dd9b0a92643efab3e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 7 Nov 2025 15:21:47 +0100 Subject: fs: speed up path lookup with cheaper handling of MAY_EXEC The generic inode_permission() routine does work which is known to be of no significance for lookup. There are checks for MAY_WRITE, while the requested permission is MAY_EXEC. Additionally devcgroup_inode_permission() is called to check for devices, but it is an invariant the inode is a directory. Absent a ->permission func, execution lands in generic_permission() which checks upfront if the requested permission is granted for everyone. We can elide the branches which are guaranteed to be false and cut straight to the check if everyone happens to be allowed MAY_EXEC on the inode (which holds true most of the time). Moreover, filesystems which provide their own ->permission routine can take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC flag on their inodes, which they can legitimately do if their MAY_EXEC handling matches generic_permission(). As a simple benchmark, as part of compilation gcc issues access(2) on numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s): before: 3797556 after: 3987789 (+5%) Note: this depends on the not-yet-landed ext4 patch to mark inodes with cache_no_acl() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251107142149.989998-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 43 +++++++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 13 +++++++------ 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 1d4d17f24fb2..94cb52b01022 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. 
*/ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap, } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; @@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, if (err != -ECHILD) // hard error return err; - return inode_permission(idmap, nd->inode, MAY_EXEC); + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ff69734b9fde 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -659,13 +659,14 @@ is_uncached_acl(struct posix_acl *acl) return (long)acl & 1; } -#define IOP_FASTPERM 0x0001 -#define IOP_LOOKUP 0x0002 -#define IOP_NOFOLLOW 0x0004 -#define IOP_XATTR 0x0008 +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 +#define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -#define IOP_MGTIME 0x0020 -#define IOP_CACHED_LINK 0x0040 +#define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 +#define IOP_FASTPERM_MAY_EXEC 0x0080 /* * Inode state bits. Protected by inode->i_lock -- cgit v1.2.3 From f99eb098090e4c8bfca4190b545e20450fee8250 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:12 +0100 Subject: platform/x86: asus-armoury: move existing tunings to asus-armoury module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fw_attributes_class provides a much cleaner interface to all of the attributes introduced to asus-wmi. 
This patch moves all of these extra attributes over to fw_attributes_class, and shifts the bulk of these definitions to a new kernel module to reduce the clutter of asus-wmi with the intention of deprecating the asus-wmi attributes in future. The work applies only to WMI methods which don't have a clearly defined place within the sysfs and as a result ended up lumped together in /sys/devices/platform/asus-nb-wmi/ with no standard API. Where possible the fw attrs now implement defaults, min, max, scalar, choices, etc. As en example dgpu_disable becomes: /sys/class/firmware-attributes/asus-armoury/attributes/dgpu_disable/ ├── current_value ├── display_name ├── possible_values └── type as do other attributes. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-3-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/hid/hid-asus.c | 1 + drivers/platform/x86/Kconfig | 12 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/asus-armoury.c | 763 +++++++++++++++++++++ drivers/platform/x86/asus-armoury.h | 200 ++++++ drivers/platform/x86/asus-wmi.c | 10 +- .../linux/platform_data/x86/asus-wmi-leds-ids.h | 50 ++ include/linux/platform_data/x86/asus-wmi.h | 44 +- 8 files changed, 1034 insertions(+), 47 deletions(-) create mode 100644 drivers/platform/x86/asus-armoury.c create mode 100644 drivers/platform/x86/asus-armoury.h create mode 100644 include/linux/platform_data/x86/asus-wmi-leds-ids.h (limited to 'include/linux') diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index a444d41e53b6..472bca54642b 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* For to_usb_interface for T100 touchpad intf check */ #include diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1e9b84f1098f..ba0806b48bb9 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -264,6 +264,18 @@ config ASUS_WIRELESS If you choose to compile this driver as a module the module will be called asus-wireless. +config ASUS_ARMOURY + tristate "ASUS Armoury driver" + depends on ASUS_WMI + select FW_ATTR_CLASS + help + Say Y here if you have a WMI aware Asus machine and would like to use the + firmware_attributes API to control various settings typically exposed in + the ASUS Armoury Crate application available on Windows. + + To compile this driver as a module, choose M here: the module will + be called asus-armoury. + config ASUS_WMI tristate "ASUS WMI Driver" depends on ACPI_WMI diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index d722e244a4a7..d59a2ed5932c 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o # ASUS obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o obj-$(CONFIG_ASUS_WIRELESS) += asus-wireless.o +obj-$(CONFIG_ASUS_ARMOURY) += asus-armoury.o obj-$(CONFIG_ASUS_WMI) += asus-wmi.o obj-$(CONFIG_ASUS_NB_WMI) += asus-nb-wmi.o obj-$(CONFIG_ASUS_TF103C_DOCK) += asus-tf103c-dock.o diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c new file mode 100644 index 000000000000..81b4972df818 --- /dev/null +++ b/drivers/platform/x86/asus-armoury.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Asus Armoury (WMI) attributes driver. 
+ * + * This driver uses the fw_attributes class to expose various WMI functions + * that are present in many gaming and some non-gaming ASUS laptops. + * + * These typically don't fit anywhere else in the sysfs such as under LED class, + * hwmon or others, and are set in Windows using the ASUS Armoury Crate tool. + * + * Copyright(C) 2024 Luke Jones + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "asus-armoury.h" +#include "firmware_attributes_class.h" + +#define ASUS_NB_WMI_EVENT_GUID "0B3CBB35-E3C2-45ED-91C2-4C5A6D195D1C" + +#define ASUS_MINI_LED_MODE_MASK GENMASK(1, 0) +/* Standard modes for devices with only on/off */ +#define ASUS_MINI_LED_OFF 0x00 +#define ASUS_MINI_LED_ON 0x01 +/* Like "on" but the effect is more vibrant or brighter */ +#define ASUS_MINI_LED_STRONG_MODE 0x02 +/* New modes for devices with 3 mini-led mode types */ +#define ASUS_MINI_LED_2024_WEAK 0x00 +#define ASUS_MINI_LED_2024_STRONG 0x01 +#define ASUS_MINI_LED_2024_OFF 0x02 + +struct asus_armoury_priv { + struct device *fw_attr_dev; + struct kset *fw_attr_kset; + + /* + * Mutex to protect eGPU activation/deactivation + * sequences and dGPU connection status: + * do not allow concurrent changes or changes + * before a reboot if dGPU got disabled. + */ + struct mutex egpu_mutex; + + u32 mini_led_dev_id; + u32 gpu_mux_dev_id; +}; + +static struct asus_armoury_priv asus_armoury = { + .egpu_mutex = __MUTEX_INITIALIZER(asus_armoury.egpu_mutex), +}; + +struct fw_attrs_group { + bool pending_reboot; +}; + +static struct fw_attrs_group fw_attrs = { + .pending_reboot = false, +}; + +struct asus_attr_group { + const struct attribute_group *attr_group; + u32 wmi_devid; +}; + +static void asus_set_reboot_and_signal_event(void) +{ + fw_attrs.pending_reboot = true; + kobject_uevent(&asus_armoury.fw_attr_dev->kobj, KOBJ_CHANGE); +} + +static ssize_t pending_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", fw_attrs.pending_reboot); +} + +static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); + +static bool asus_bios_requires_reboot(struct kobj_attribute *attr) +{ + return !strcmp(attr->attr.name, "gpu_mux_mode"); +} + +/** + * armoury_has_devstate() - Check presence of the WMI function state. + * + * @dev_id: The WMI method ID to check for presence. + * + * Returns: true iif method is supported. + */ +static bool armoury_has_devstate(u32 dev_id) +{ + u32 retval; + int status; + + status = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, &retval); + pr_debug("%s called (0x%08x), retval: 0x%08x\n", __func__, dev_id, retval); + + return status == 0 && (retval & ASUS_WMI_DSTS_PRESENCE_BIT); +} + +/** + * armoury_get_devstate() - Get the WMI function state. + * @attr: NULL or the kobj_attribute associated to called WMI function. + * @dev_id: The WMI method ID to call. + * @retval: + * * non-NULL pointer to where to store the value returned from WMI + * * with the function presence bit cleared. + * + * Intended usage is from sysfs attribute checking associated WMI function. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. 
+ */ +static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 dev_id) +{ + int err; + + err = asus_wmi_get_devstate_dsts(dev_id, retval); + if (err) { + if (attr) + pr_err("Failed to get %s: %d\n", attr->attr.name, err); + else + pr_err("Failed to get devstate for 0x%x: %d\n", dev_id, err); + + return err; + } + + /* + * asus_wmi_get_devstate_dsts will populate retval with WMI return, but + * the true value is expressed when ASUS_WMI_DSTS_PRESENCE_BIT is clear. + */ + *retval &= ~ASUS_WMI_DSTS_PRESENCE_BIT; + + return 0; +} + +/** + * armoury_set_devstate() - Set the WMI function state. + * @attr: The kobj_attribute associated to called WMI function. + * @dev_id: The WMI method ID to call. + * @value: The new value to be set. + * @retval: Where to store the value returned from WMI or NULL. + * + * Intended usage is from sysfs attribute setting associated WMI function. + * Before calling the presence of the function should be checked. + * + * Every WMI write MUST go through this function to enforce safety checks. + * + * Results !1 is usually considered a fail by ASUS, but some WMI methods + * (like eGPU or CPU cores) do use > 1 to return a status code or similar: + * in these cases caller is interested in the actual return value + * and should perform relevant checks. + * + * Returns: + * * %-EIO - WMI function returned an error. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +static int armoury_set_devstate(struct kobj_attribute *attr, + u32 value, u32 *retval, u32 dev_id) +{ + u32 result; + int err; + + err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result); + if (err) { + if (attr) + pr_err("Failed to set %s: %d\n", attr->attr.name, err); + else + pr_err("Failed to set devstate for 0x%x: %d\n", dev_id, err); + + return err; + } + + /* + * If retval == NULL caller is uninterested in return value: + * perform the most common result check here. 
+ */ + if ((retval == NULL) && (result == 0)) { + pr_err("Failed to set %s: (result): 0x%x\n", attr->attr.name, result); + return -EIO; + } + + return 0; +} + +static int armoury_attr_enum_list(char *buf, size_t enum_values) +{ + size_t i; + int len = 0; + + for (i = 0; i < enum_values; i++) { + if (i == 0) + len += sysfs_emit_at(buf, len, "%zu", i); + else + len += sysfs_emit_at(buf, len, ";%zu", i); + } + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} + +ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count, u32 min, u32 max, + u32 *store_value, u32 wmi_dev) +{ + u32 value; + int err; + + err = kstrtou32(buf, 10, &value); + if (err) + return err; + + if (value < min || value > max) + return -EINVAL; + + err = armoury_set_devstate(attr, value, NULL, wmi_dev); + if (err) + return err; + + if (store_value != NULL) + *store_value = value; + sysfs_notify(kobj, NULL, attr->attr.name); + + if (asus_bios_requires_reboot(attr)) + asus_set_reboot_and_signal_event(); + + return count; +} + +ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf, u32 wmi_dev) +{ + u32 result; + int err; + + err = armoury_get_devstate(attr, &result, wmi_dev); + if (err) + return err; + + return sysfs_emit(buf, "%u\n", result); +} + +static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "enumeration\n"); +} + +/* Mini-LED mode **************************************************************/ + +/* Values map for mini-led modes on 2023 and earlier models. */ +static u32 mini_led_mode1_map[] = { + [0] = ASUS_MINI_LED_OFF, + [1] = ASUS_MINI_LED_ON, +}; + +/* Values map for mini-led modes on 2024 and later models. 
*/ +static u32 mini_led_mode2_map[] = { + [0] = ASUS_MINI_LED_2024_OFF, + [1] = ASUS_MINI_LED_2024_WEAK, + [2] = ASUS_MINI_LED_2024_STRONG, +}; + +static ssize_t mini_led_mode_current_value_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + u32 *mini_led_mode_map; + size_t mini_led_mode_map_size; + u32 i, mode; + int err; + + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + mini_led_mode_map = mini_led_mode1_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map); + break; + + case ASUS_WMI_DEVID_MINI_LED_MODE2: + mini_led_mode_map = mini_led_mode2_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map); + break; + + default: + pr_err("Unrecognized mini-LED device: %u\n", asus_armoury.mini_led_dev_id); + return -ENODEV; + } + + err = armoury_get_devstate(attr, &mode, asus_armoury.mini_led_dev_id); + if (err) + return err; + + mode = FIELD_GET(ASUS_MINI_LED_MODE_MASK, 0); + + for (i = 0; i < mini_led_mode_map_size; i++) + return sysfs_emit(buf, "%u\n", mini_led_mode_map[i]); + + pr_warn("Unrecognized mini-LED mode: %u", mode); + return -EINVAL; +} + +static ssize_t mini_led_mode_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + u32 *mini_led_mode_map; + size_t mini_led_mode_map_size; + u32 mode; + int err; + + err = kstrtou32(buf, 10, &mode); + if (err) + return err; + + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + mini_led_mode_map = mini_led_mode1_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map); + break; + + case ASUS_WMI_DEVID_MINI_LED_MODE2: + mini_led_mode_map = mini_led_mode2_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map); + break; + + default: + pr_err("Unrecognized mini-LED devid: %u\n", asus_armoury.mini_led_dev_id); + return -EINVAL; + } + + if (mode >= mini_led_mode_map_size) { + return pr_warn("mini-LED mode unrecognized device: %u\n", mode); + return -ENODEV; + } + + return armoury_attr_uint_store(kobj, attr, buf, count, + 0, mini_led_mode_map[mode], + NULL, asus_armoury.mini_led_dev_id); +} + +static ssize_t mini_led_mode_possible_values_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + return armoury_attr_enum_list(buf, ARRAY_SIZE(mini_led_mode1_map)); + case ASUS_WMI_DEVID_MINI_LED_MODE2: + return armoury_attr_enum_list(buf, ARRAY_SIZE(mini_led_mode2_map)); + default: + return -ENODEV; + } +} +ASUS_ATTR_GROUP_ENUM(mini_led_mode, "mini_led_mode", "Set the mini-LED backlight mode"); + +static ssize_t gpu_mux_mode_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int result, err; + bool optimus; + + err = kstrtobool(buf, &optimus); + if (err) + return err; + + if (armoury_has_devstate(ASUS_WMI_DEVID_DGPU)) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_DGPU); + if (err) + return err; + if (result && !optimus) { + pr_warn("Cannot switch MUX to dGPU mode when dGPU is disabled: %02X\n", + result); + return -ENODEV; + } + } + + if (armoury_has_devstate(ASUS_WMI_DEVID_EGPU)) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU); + if (err) + return err; + if (result && !optimus) { + pr_warn("Cannot switch MUX to dGPU mode when eGPU is enabled\n"); + return -EBUSY; + } + } + + err = armoury_set_devstate(attr, optimus ? 
1 : 0, NULL, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + + sysfs_notify(kobj, NULL, attr->attr.name); + asus_set_reboot_and_signal_event(); + + return count; +} +ASUS_WMI_SHOW_INT(gpu_mux_mode_current_value, asus_armoury.gpu_mux_dev_id); +ASUS_ATTR_GROUP_BOOL(gpu_mux_mode, "gpu_mux_mode", "Set the GPU display MUX mode"); + +static ssize_t dgpu_disable_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + int result, err; + bool disable; + + err = kstrtobool(buf, &disable); + if (err) + return err; + + if (asus_armoury.gpu_mux_dev_id) { + err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + if (!result && disable) { + pr_warn("Cannot disable dGPU when the MUX is in dGPU mode\n"); + return -EBUSY; + } + } + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + err = armoury_set_devstate(attr, disable ? 1 : 0, NULL, ASUS_WMI_DEVID_DGPU); + if (err) + return err; + } + + sysfs_notify(kobj, NULL, attr->attr.name); + + return count; +} +ASUS_WMI_SHOW_INT(dgpu_disable_current_value, ASUS_WMI_DEVID_DGPU); +ASUS_ATTR_GROUP_BOOL(dgpu_disable, "dgpu_disable", "Disable the dGPU"); + +/* Values map for eGPU activation requests. */ +static u32 egpu_status_map[] = { + [0] = 0x00000000U, + [1] = 0x00000001U, + [2] = 0x00000101U, + [3] = 0x00000201U, +}; + +/* + * armoury_pci_rescan() - Performs a PCI rescan + * + * Bring up any GPU that has been hotplugged in the system. + */ +static void armoury_pci_rescan(void) +{ + struct pci_bus *b = NULL; + + pci_lock_rescan_remove(); + while ((b = pci_find_next_bus(b)) != NULL) + pci_rescan_bus(b); + pci_unlock_rescan_remove(); +} + +/* + * The ACPI call to enable the eGPU might also disable the internal dGPU, + * but this is not always the case and on certain models enabling the eGPU + * when the dGPU is either still active or has been disabled without rebooting + * will make both GPUs malfunction and the kernel will detect many + * PCI AER unrecoverable errors. + */ +static ssize_t egpu_enable_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + u32 requested, enable, result; + + err = kstrtou32(buf, 10, &requested); + if (err) + return err; + + if (requested >= ARRAY_SIZE(egpu_status_map)) + return -EINVAL; + enable = egpu_status_map[requested]; + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + /* Ensure the eGPU is connected before attempting to activate it. */ + if (enable) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU_CONNECTED); + if (err) { + pr_warn("Failed to get eGPU connection status: %d\n", err); + return err; + } + if (!result) { + pr_warn("Cannot activate eGPU while undetected\n"); + return -ENOENT; + } + } + + if (asus_armoury.gpu_mux_dev_id) { + err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + + if (!result && enable) { + pr_warn("Cannot enable eGPU when the MUX is in dGPU mode\n"); + return -ENODEV; + } + } + + err = armoury_set_devstate(attr, enable, &result, ASUS_WMI_DEVID_EGPU); + if (err) { + pr_err("Failed to set %s: %d\n", attr->attr.name, err); + return err; + } + + /* + * ACPI returns value 0x01 on success and 0x02 on a partial activation: + * performing a pci rescan will bring up the device in pci-e 3.0 speed, + * after a reboot the device will work at full speed. 
+ */ + switch (result) { + case 0x01: + /* + * When a GPU is in use it does not get disconnected even if + * the ACPI call returns a success. + */ + if (!enable) { + err = armoury_get_devstate(attr, &result, ASUS_WMI_DEVID_EGPU); + if (err) { + pr_warn("Failed to ensure eGPU is deactivated: %d\n", err); + return err; + } + + if (result != 0) + return -EBUSY; + } + + pr_debug("Success changing the eGPU status\n"); + break; + case 0x02: + pr_info("Success changing the eGPU status, a reboot is strongly advised\n"); + asus_set_reboot_and_signal_event(); + break; + default: + pr_err("Failed to change the eGPU status: wmi result is 0x%x\n", result); + return -EIO; + } + } + + /* + * Perform a PCI rescan: on every tested model this is necessary + * to make the eGPU visible on the bus without rebooting. + */ + armoury_pci_rescan(); + + sysfs_notify(kobj, NULL, attr->attr.name); + + return count; +} + +static ssize_t egpu_enable_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i, err; + u32 status; + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + err = armoury_get_devstate(attr, &status, ASUS_WMI_DEVID_EGPU); + if (err) + return err; + } + + for (i = 0; i < ARRAY_SIZE(egpu_status_map); i++) { + if (egpu_status_map[i] == status) + return sysfs_emit(buf, "%u\n", i); + } + + return -EIO; +} + +static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return armoury_attr_enum_list(buf, ARRAY_SIZE(egpu_status_map)); +} +ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); + +/* Simple attribute creation */ +ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", + "Show the current mode of charging"); +ASUS_ATTR_GROUP_BOOL_RW(boot_sound, "boot_sound", ASUS_WMI_DEVID_BOOT_SOUND, + "Set the boot POST sound"); +ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWERSAVE, + "Set MCU powersaving mode"); +ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, + "Set the panel refresh overdrive"); +ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, + "Show the eGPU connection status"); + +/* If an attribute does not require any special case handling add it here */ +static const struct asus_attr_group armoury_attr_groups[] = { + { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, + { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, + { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, + + { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, + { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, + { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, + { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, +}; + +static int asus_fw_attr_add(void) +{ + int err, i; + + asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), + NULL, "%s", DRIVER_NAME); + if (IS_ERR(asus_armoury.fw_attr_dev)) { + err = PTR_ERR(asus_armoury.fw_attr_dev); + goto fail_class_get; + } + + asus_armoury.fw_attr_kset = kset_create_and_add("attributes", NULL, + &asus_armoury.fw_attr_dev->kobj); + if (!asus_armoury.fw_attr_kset) { + err = -ENOMEM; + goto err_destroy_classdev; + } + + err = sysfs_create_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); + if (err) { + pr_err("Failed to create sysfs level attributes\n"); + goto err_destroy_kset; + } + + asus_armoury.mini_led_dev_id = 0; + if 
(armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE)) + asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; + else if (armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE2)) + asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2; + + if (asus_armoury.mini_led_dev_id) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + &mini_led_mode_attr_group); + if (err) { + pr_err("Failed to create sysfs-group for mini_led\n"); + goto err_remove_file; + } + } + + asus_armoury.gpu_mux_dev_id = 0; + if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX)) + asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX; + else if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX_VIVO)) + asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX_VIVO; + + if (asus_armoury.gpu_mux_dev_id) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + &gpu_mux_mode_attr_group); + if (err) { + pr_err("Failed to create sysfs-group for gpu_mux\n"); + goto err_remove_mini_led_group; + } + } + + for (i = 0; i < ARRAY_SIZE(armoury_attr_groups); i++) { + if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + continue; + + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + if (err) { + pr_err("Failed to create sysfs-group for %s\n", + armoury_attr_groups[i].attr_group->name); + goto err_remove_groups; + } + } + + return 0; + +err_remove_groups: + while (i--) { + if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + } + if (asus_armoury.gpu_mux_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); +err_remove_mini_led_group: + if (asus_armoury.mini_led_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); +err_remove_file: + sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); +err_destroy_kset: + kset_unregister(asus_armoury.fw_attr_kset); +err_destroy_classdev: +fail_class_get: + device_destroy(&firmware_attributes_class, MKDEV(0, 0)); + return err; +} + +/* Init / exit ****************************************************************/ + +static int __init asus_fw_init(void) +{ + char *wmi_uid; + + wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); + if (!wmi_uid) + return -ENODEV; + + /* + * if equal to "ASUSWMI" then it's DCTS that can't be used for this + * driver, DSTS is required. 
+ */ + if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) + return -ENODEV; + + return asus_fw_attr_add(); +} + +static void __exit asus_fw_exit(void) +{ + int i; + + for (i = ARRAY_SIZE(armoury_attr_groups) - 1; i >= 0; i--) { + if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + } + + if (asus_armoury.gpu_mux_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); + + if (asus_armoury.mini_led_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); + + sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); + kset_unregister(asus_armoury.fw_attr_kset); + device_destroy(&firmware_attributes_class, MKDEV(0, 0)); +} + +module_init(asus_fw_init); +module_exit(asus_fw_exit); + +MODULE_IMPORT_NS("ASUS_WMI"); +MODULE_AUTHOR("Luke Jones "); +MODULE_DESCRIPTION("ASUS BIOS Configuration Driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("wmi:" ASUS_NB_WMI_EVENT_GUID); diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h new file mode 100644 index 000000000000..3a2a674a1b55 --- /dev/null +++ b/drivers/platform/x86/asus-armoury.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Definitions for kernel modules using asus-armoury driver + * + * Copyright (c) 2024 Luke Jones + */ + +#ifndef _ASUS_ARMOURY_H_ +#define _ASUS_ARMOURY_H_ + +#include +#include +#include + +#define DRIVER_NAME "asus-armoury" + +/** + * armoury_attr_uint_store() - Send an uint to WMI method if within min/max. + * @kobj: Pointer to the driver object. + * @attr: Pointer to the attribute calling this function. + * @buf: The buffer to read from, this is parsed to `uint` type. + * @count: Required by sysfs attribute macros, pass in from the callee attr. + * @min: Minimum accepted value. Below this returns -EINVAL. + * @max: Maximum accepted value. Above this returns -EINVAL. + * @store_value: Pointer to where the parsed value should be stored. + * @wmi_dev: The WMI function ID to use. + * + * This function is intended to be generic so it can be called from any "_store" + * attribute which works only with integers. + * + * Integers to be sent to the WMI method is inclusive range checked and + * an error returned if out of range. + * + * If the value is valid and WMI is success then the sysfs attribute is notified + * and if asus_bios_requires_reboot() is true then reboot attribute + * is also notified. + * + * Returns: Either count, or an error. + */ +ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count, u32 min, u32 max, + u32 *store_value, u32 wmi_dev); + +/** + * armoury_attr_uint_show() - Receive an uint from a WMI method. + * @kobj: Pointer to the driver object. + * @attr: Pointer to the attribute calling this function. + * @buf: The buffer to write to, as an `uint` type. + * @wmi_dev: The WMI function ID to use. + * + * This function is intended to be generic so it can be called from any "_show" + * attribute which works only with integers. + * + * Returns: Either count, or an error. 
+ */ +ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf, u32 wmi_dev); + +#define __ASUS_ATTR_RO(_func, _name) \ + { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _func##_##_name##_show, \ + } + +#define __ASUS_ATTR_RO_AS(_name, _show) \ + { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _show, \ + } + +#define __ASUS_ATTR_RW(_func, _name) \ + __ATTR(_name, 0644, _func##_##_name##_show, _func##_##_name##_store) + +#define __WMI_STORE_INT(_attr, _min, _max, _wmi) \ + static ssize_t _attr##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ + { \ + return armoury_attr_uint_store(kobj, attr, buf, count, _min, \ + _max, NULL, _wmi); \ + } + +#define ASUS_WMI_SHOW_INT(_attr, _wmi) \ + static ssize_t _attr##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return armoury_attr_uint_show(kobj, attr, buf, _wmi); \ + } + +/* Create functions and attributes for use in other macros or on their own */ + +/* Shows a formatted static variable */ +#define __ATTR_SHOW_FMT(_prop, _attrname, _fmt, _val) \ + static ssize_t _attrname##_##_prop##_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + return sysfs_emit(buf, _fmt, _val); \ + } \ + static struct kobj_attribute attr_##_attrname##_##_prop = \ + __ASUS_ATTR_RO(_attrname, _prop) + +#define __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)\ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RO(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#define __ATTR_RW_INT_GROUP_ENUM(_attrname, _minv, _maxv, _wmi, _fsname,\ + _possible, _dispname) \ + __WMI_STORE_INT(_attrname##_current_value, _minv, _maxv, _wmi); \ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* Boolean style enumeration, base macro. 
Requires adding show/store */ +#define __ATTR_GROUP_ENUM(_attrname, _fsname, _possible, _dispname) \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#define ASUS_ATTR_GROUP_BOOL_RO(_attrname, _fsname, _wmi, _dispname) \ + __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, "0;1", _dispname) + + +#define ASUS_ATTR_GROUP_BOOL_RW(_attrname, _fsname, _wmi, _dispname) \ + __ATTR_RW_INT_GROUP_ENUM(_attrname, 0, 1, _wmi, _fsname, "0;1", _dispname) + +#define ASUS_ATTR_GROUP_ENUM_INT_RO(_attrname, _fsname, _wmi, _possible, _dispname) \ + __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname) + +/* + * Requires _current_value_show(), _current_value_show() + */ +#define ASUS_ATTR_GROUP_BOOL(_attrname, _fsname, _dispname) \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + __ATTR_GROUP_ENUM(_attrname, _fsname, "0;1", _dispname) + +/* + * Requires _current_value_show(), _current_value_show() + * and _possible_values_show() + */ +#define ASUS_ATTR_GROUP_ENUM(_attrname, _fsname, _dispname) \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + static struct kobj_attribute attr_##_attrname##_possible_values = \ + __ASUS_ATTR_RO(_attrname, possible_values); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#endif /* _ASUS_ARMOURY_H_ */ diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index c3e90517ce0f..ff98267e5981 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -55,8 +57,6 @@ module_param(fnlock_default, bool, 0444); #define to_asus_wmi_driver(pdrv) \ (container_of((pdrv), struct asus_wmi_driver, platform_driver)) -#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" - #define NOTIFY_BRNUP_MIN 0x11 #define NOTIFY_BRNUP_MAX 0x1f #define NOTIFY_BRNDOWN_MIN 0x20 @@ -105,8 +105,6 @@ module_param(fnlock_default, bool, 0444); #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 -#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" - #define WMI_EVENT_MASK 0xFFFF #define FAN_CURVE_POINTS 8 @@ -561,8 +559,8 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) * * Returns: * * %-ENODEV - method ID is unsupported. - * * %0 - successful and retval is filled. 
- * * %other - error from WMI call. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. */ int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) { diff --git a/include/linux/platform_data/x86/asus-wmi-leds-ids.h b/include/linux/platform_data/x86/asus-wmi-leds-ids.h new file mode 100644 index 000000000000..034a039c4e37 --- /dev/null +++ b/include/linux/platform_data/x86/asus-wmi-leds-ids.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H +#define __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H + +#include +#include + +/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ +#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS) +static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { }, +}; +#endif + +#endif /* __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index dbd44d9fbb6f..8ea8925a0fc5 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -4,7 +4,9 @@ #include #include -#include + +#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" +#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" /* WMI Methods */ #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ @@ -191,44 +193,4 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, } #endif -/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ -static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GA403U"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GU605M"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "RC71L"), - }, - }, - { }, -}; - #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ -- cgit v1.2.3 From 628cb03b15f2a0f10534979b3ea9c8befe87c381 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:13 +0100 Subject: platform/x86: asus-armoury: add panel_hd_mode attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add panel_hd_mode to toggle the panel mode between single and high definition modes. Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-4-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 6 +++++- include/linux/platform_data/x86/asus-wmi.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index 81b4972df818..f0cb973a487e 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -96,7 +96,8 @@ static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); static bool asus_bios_requires_reboot(struct kobj_attribute *attr) { - return !strcmp(attr->attr.name, "gpu_mux_mode"); + return !strcmp(attr->attr.name, "gpu_mux_mode") || + !strcmp(attr->attr.name, "panel_hd_mode"); } /** @@ -607,6 +608,8 @@ ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWER "Set MCU powersaving mode"); ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, "Set the panel refresh overdrive"); +ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, + "Set the panel HD mode to UHD<0> or FHD<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); @@ -620,6 +623,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, + { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, }; static int asus_fw_attr_add(void) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8ea8925a0fc5..3cc235b20be4 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -75,6 +75,7 @@ #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 /* Misc */ +#define ASUS_WMI_DEVID_PANEL_HD 0x0005001C #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 -- cgit v1.2.3 From bfd3749d489ec0df27ed94ee3dfd9475fea27bf9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:04 -1000 Subject: sched_ext: Use shorter slice in bypass mode There have been reported cases of bypass mode not making forward progress fast enough. The 20ms default slice is unnecessarily long for bypass mode where the primary goal is ensuring all tasks can make forward progress. Introduce SCX_SLICE_BYPASS set to 5ms and make the scheduler automatically switch to it when entering bypass mode. Also make the bypass slice value tunable through the slice_bypass_us module parameter (adjustable between 100us and 100ms) to make it easier to test whether slice durations are a factor in problem cases. v3: Use READ_ONCE/WRITE_ONCE for scx_slice_dfl access (Dan). v2: Removed slice_dfl_us module parameter. Fixed typos (Andrea). 
Reviewed-by: Emil Tsalapatis Reviewed-by: Andrea Righi Cc: Dan Schatzberg Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 11 +++++++++++ kernel/sched/ext.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eb776b094d36..60285c3d07cf 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -17,7 +17,18 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, + /* + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses + * to set the slice for a task that is selected for execution. + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice + * refill has been triggered. + * + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass + * mode. As making forward progress for all tasks is the main goal of + * the bypass mode, a shorter slice is used. + */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 652a364e9e4c..1a9b28dd0961 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -143,6 +143,32 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. + */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." 
+ +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); + +#undef MODULE_PARAM_PREFIX + #define CREATE_TRACE_POINTS #include @@ -919,7 +945,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } @@ -2896,7 +2922,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -3774,6 +3800,7 @@ static void scx_bypass(bool bypass) WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); if (sch) scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); @@ -3782,6 +3809,7 @@ static void scx_bypass(bool bypass) WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); if (sch) scx_add_event(sch, SCX_EV_BYPASS_DURATION, ktime_get_ns() - bypass_timestamp); @@ -4780,7 +4808,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) queue_flags |= DEQUEUE_CLASS; scoped_guard (sched_change, p, queue_flags) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); p->sched_class = new_class; } } -- cgit v1.2.3 From 61debc251c1c9150c7bdfd5c028bc2d078e17d22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:06 -1000 Subject: sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode Bypass mode routes tasks through fallback dispatch queues. Originally a single global DSQ, b7b3b2dbae73 ("sched_ext: Split the global DSQ per NUMA node") changed this to per-node DSQs to resolve NUMA-related livelocks. Dan Schatzberg found per-node DSQs can still livelock when many threads are pinned to different small CPU subsets: each CPU must scan many incompatible tasks to find runnable ones, causing severe contention with high CPU counts. Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default idle CPU selection and direct dispatch handle most cases well. This introduces a failure mode when tasks concentrate on one CPU in over-saturated systems. If the BPF scheduler severely skews placement before triggering bypass, that CPU's queue may be too long to drain, causing RCU stalls. A load balancer in a future patch will address this. The bypass DSQ is separate from local DSQ to enable load balancing: local DSQs use rq locks, preventing efficient scanning and transfer across CPUs, especially problematic when systems are already contended. v2: Clarified why bypass DSQ is separate from local DSQ (Andrea Righi). 
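Concretely, as the hunks below show, do_enqueue_task() in bypass mode now queues the task on task_rq(p)->scx.bypass_dsq and only that CPU's balance_one() consumes from it, so a CPU no longer has to scan through tasks it cannot run anyway.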
Reported-by: Dan Schatzberg Reviewed-by: Dan Schatzberg Reviewed-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 16 +++++++++++++--- kernel/sched/sched.h | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 60285c3d07cf..3d3216ff9188 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -57,6 +57,7 @@ enum scx_dsq_id_flags { SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 43083602c15e..747391a3f6e3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (scx_rq_bypassing(rq)) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); - goto global; + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -1356,6 +1356,9 @@ local: global: dsq = find_global_dsq(sch, p); goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; enqueue: /* @@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (consume_global_dsq(sch, rq)) goto has_tasks; - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || - scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void) int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27aae2a298f8..5991133a4849 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -808,6 +808,7 @@ struct scx_rq { struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; + struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ -- cgit v1.2.3 From 582f700e1bdc5978f41e3d8d65d3e16e34e9be8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:12 -1000 Subject: sched_ext: Hook up hardlockup detector A poorly behaving BPF scheduler can trigger hard lockup. For example, on a large system with many tasks pinned to different subsets of CPUs, if the BPF scheduler puts all tasks in a single DSQ and lets all CPUs at it, the DSQ lock can be contended to the point where hardlockup triggers. Unfortunately, hardlockup can be the first signal out of such situations, thus requiring hardlockup handling. Hook scx_hardlockup() into the hardlockup detector to try kicking out the current scheduler in an attempt to recover the system to a good state. The handling strategy can delay watchdog taking its own action by one polling period; however, given that the only remediation for hardlockup is crash, this is likely an acceptable trade-off. v2: Add missing dummy scx_hardlockup() definition for !CONFIG_SCHED_CLASS_EXT (kernel test bot). 
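With this hooked up, a lockup that fires while a BPF scheduler is loaded is expected to produce a "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler" message and an aborted scheduler instead of the watchdog's usual warn/panic handling for that event, since watchdog_hardlockup_check() returns early once scx_hardlockup() reports that it took care of it.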
Reported-by: Dan Schatzberg Cc: Emil Tsalapatis Cc: Douglas Anderson Cc: Andrew Morton Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 2 ++ kernel/sched/ext.c | 18 ++++++++++++++++++ kernel/watchdog.c | 9 +++++++++ 3 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3d3216ff9188..d6c152475f5b 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -223,6 +223,7 @@ struct sched_ext_entity { void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); +bool scx_hardlockup(void); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ @@ -230,6 +231,7 @@ bool scx_rcu_cpu_stall(void); static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_hardlockup(void) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 85bb052459ec..b5c87a03f112 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3711,6 +3711,24 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/** + * scx_hardlockup - sched_ext hardlockup handler + * + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting + * numerous affinitized tasks in a single queue and directing all CPUs at it. + * Try kicking out the current scheduler in an attempt to recover the system to + * a good state before taking more drastic actions. + */ +bool scx_hardlockup(void) +{ + if (!handle_lockup("hard lockup - CPU %d", smp_processor_id())) + return false; + + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + smp_processor_id()); + return true; +} + /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b62d1002783..8dfac4a8f587 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) #ifdef CONFIG_SYSFS ++hardlockup_count; #endif + /* + * A poorly behaving BPF scheduler can trigger hard lockup by + * e.g. putting numerous affinitized tasks in a single queue and + * directing all CPUs at it. The following call can return true + * only once when sched_ext is enabled and will immediately + * abort the BPF scheduler and print out a warning message. + */ + if (scx_hardlockup()) + return; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) -- cgit v1.2.3 From d2974cc79f7139cc851b84ad4f77805e93c40fe1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:14 -1000 Subject: sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR macro in preparation for additional users. 
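An additional user then only has to write something along the lines of kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, seq) instead of open-coding the three field initializations, exactly as bpf_iter_scx_dsq_new() does below.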
Reviewed-by: Emil Tsalapatis Cc: Dan Schatzberg Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 7 +++++++ kernel/sched/ext.c | 5 ++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d6c152475f5b..70ee5c28a74d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -149,6 +149,13 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; +#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ + (struct scx_dsq_list_node) { \ + .node = LIST_HEAD_INIT((__node).node), \ + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ + .priv = (__priv), \ + } + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b5c87a03f112..56946aceeb28 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6253,9 +6253,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; - kit->cursor.priv = READ_ONCE(kit->dsq->seq); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, + READ_ONCE(kit->dsq->seq)); return 0; } -- cgit v1.2.3 From 05d6f1cc2dc214c1491181be13f37d2a3a26f694 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 13 Oct 2025 11:12:02 +0200 Subject: compiler.h: remove ARCH_SEL() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its last user was removed in commit 8ea815399c3f ("compiler: remove __ADDRESSABLE_ASM{_STR,}() again"). Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton --- include/linux/compiler.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..a9a2f8aae821 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -269,12 +269,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_64BIT -#define ARCH_SEL(a,b) a -#else -#define ARCH_SEL(a,b) b -#endif - /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. 
Necessary in case 'sym' could be inlined -- cgit v1.2.3 From adc15829fb73e402903b7030729263b6ee4a7232 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 16 Oct 2025 19:58:31 +0530 Subject: crash: let architecture decide crash memory export to iomem_resource With the generic crashkernel reservation, the kernel emits the following warning on powerpc: WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000 REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f) MSR: 8000000002029033 CR: 84000840 XER: 20040010 CFAR: c00000000017eed0 IRQMASK: 0 GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001 GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000 GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808 GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950 GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710 NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180 LR [c00000000201de34] add_system_ram_resources+0xf4/0x180 Call Trace: add_system_ram_resources+0xf4/0x180 (unreliable) do_one_initcall+0x60/0x36c do_initcalls+0x120/0x220 kernel_init_freeable+0x23c/0x390 kernel_init+0x34/0x26c ret_from_kernel_user_thread+0x14/0x1c This warning occurs due to a conflict between crashkernel and System RAM iomem resources. The generic crashkernel reservation adds the crashkernel memory range to /proc/iomem during early initialization. Later, all memblock ranges are added to /proc/iomem as System RAM. If the crashkernel region overlaps with any memblock range, it causes a conflict while adding those memblock regions as iomem resources, triggering the above warning. The conflicting memblock regions are then omitted from /proc/iomem. For example, if the following crashkernel region is added to /proc/iomem: 20000000-11fffffff : Crash kernel then the following memblock regions System RAM regions fail to be inserted: 00000000-7fffffff : System RAM 80000000-257fffffff : System RAM Fix this by not adding the crashkernel memory to /proc/iomem on powerpc. Introduce an architecture hook to let each architecture decide whether to export the crashkernel region to /proc/iomem. For more info checkout commit c40dd2f766440 ("powerpc: Add System RAM to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM resource to prevent crashkernel conflict") Note: Before switching to the generic crashkernel reservation, powerpc never exported the crashkernel region to /proc/iomem. Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation"). 
Signed-off-by: Sourabh Jain Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/ Tested-by: Venkat Rao Bagalkote Cc: Baoquan he Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Cc: Ritesh Harjani (IBM) Cc: Vivek Goyal Cc: Dave Young Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++ include/linux/crash_reserve.h | 6 ++++++ kernel/crash_reserve.c | 3 +++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/crash_reserve.h b/arch/powerpc/include/asm/crash_reserve.h index 6467ce29b1fa..d1b570ddbf98 100644 --- a/arch/powerpc/include/asm/crash_reserve.h +++ b/arch/powerpc/include/asm/crash_reserve.h @@ -5,4 +5,12 @@ /* crash kernel regions are Page size agliged */ #define CRASH_ALIGN PAGE_SIZE +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static inline bool arch_add_crash_res_to_iomem(void) +{ + return false; +} +#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem +#endif + #endif /* _ASM_POWERPC_CRASH_RESERVE_H */ diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 7b44b41d0a20..f0dc03d94ca2 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef arch_add_crash_res_to_iomem +static inline bool arch_add_crash_res_to_iomem(void) +{ + return true; +} +#endif #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) #endif diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 87bf4d41eabb..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size) #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); -- cgit v1.2.3 From 37ade54f386c829597f74b54bad335c12bd2a698 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 22 Oct 2025 10:28:04 +0200 Subject: taint/module: remove unnecessary taint_flag.module field The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the taint_flags table as per-module flags. While this can be trivially corrected, the issue can be avoided altogether by removing the taint_flag.module field. This is possible because, since commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") in 2016, the handling of module taint flags has been fully generic. Specifically, module_flags_taint() can print all flags, and the required output buffer size is properly defined in terms of TAINT_FLAGS_COUNT. The actual per-module flags are always those added to module.taints by calls to add_taint_module(). 
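Observable behaviour should not change: a proprietary or out-of-tree module, for instance, keeps showing the same per-module characters (such as "(POE)") in /proc/modules and in the "Modules linked in:" line of an oops, because only bits actually set in module.taints are printed, while kernel-only taints like TAINT_CPU_OUT_OF_SPEC still never appear there as nothing sets them through add_taint_module().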
Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu Acked-by: Petr Mladek Reviewed-by: Randy Dunlap Cc: Aaron Tomlin Cc: Luis Chamberalin Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/panic.h | 1 - kernel/module/main.c | 2 +- kernel/panic.c | 46 +++++++++++++++++++++------------------------- 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 6f972a66c13e..a00bc0937698 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -86,7 +86,6 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) struct taint_flag { char c_true; /* character printed when tainted */ char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ const char *desc; /* verbose description of the set taint flag */ }; diff --git a/kernel/module/main.c b/kernel/module/main.c index c66b26184936..6f219751df7e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } diff --git a/kernel/panic.c b/kernel/panic.c index ec59cade1f83..ffceb6f13935 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -628,17 +628,13 @@ void panic(const char *fmt, ...) } EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. - * * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, * please also modify tools/debugging/kernel-chktaint and * Documentation/admin-guide/tainted-kernels.rst, including its @@ -646,26 +642,26 @@ EXPORT_SYMBOL(panic); * /proc/sys/kernel/tainted. 
*/ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG -- cgit v1.2.3 From d99dc586ca7c7729450af2ed39ca1483c0eb7b5c Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Thu, 23 Oct 2025 13:16:06 -0400 Subject: uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is enabled. This pollutes exported symbols namespace, and spreads RUST ifdefery in core files. It's better to declare a corresponding helper under the rust/helpers, similarly to how non-underscored copy_{from,to}_user() is handled. [yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice] Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Trevor Gross Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 2 -- lib/usercopy.c | 4 ++-- rust/helpers/uaccess.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..01cbd7dd0ba3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -152,8 +152,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. - * - * Rust code always uses the extern definition. 
*/ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) diff --git a/lib/usercopy.c b/lib/usercopy.c index 7b17b83c8042..b00a3a957de6 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,7 +12,7 @@ /* out-of-line parts */ -#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_FROM_USER) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { return _inline_copy_from_user(to, from, n); @@ -20,7 +20,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n EXPORT_SYMBOL(_copy_from_user); #endif -#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_TO_USER) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { return _inline_copy_to_user(to, from, n); diff --git a/rust/helpers/uaccess.c b/rust/helpers/uaccess.c index f49076f813cd..4629b2d15529 100644 --- a/rust/helpers/uaccess.c +++ b/rust/helpers/uaccess.c @@ -13,3 +13,15 @@ unsigned long rust_helper_copy_to_user(void __user *to, const void *from, { return copy_to_user(to, from, n); } + +#ifdef INLINE_COPY_FROM_USER +unsigned long rust_helper__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return _inline_copy_from_user(to, from, n); +} + +unsigned long rust_helper__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return _inline_copy_to_user(to, from, n); +} +#endif -- cgit v1.2.3 From 6c2e6e2c1af1809d1d9cdbd50ac80f54f5995bdb Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 25 Oct 2025 16:00:03 +0800 Subject: dynamic_debug: add support for print stack In practical problem diagnosis, especially during the boot phase, it is often desirable to know the call sequence. However, currently, apart from adding print statements and recompiling the kernel, there seems to be no good alternative. If dynamic_debug supported printing the call stack, it would be very helpful for diagnosing issues. This patch add support '+d' for dump stack. Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com Signed-off-by: Ye Bin Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++-- include/linux/dynamic_debug.h | 17 ++++++++++++++--- lib/dynamic_debug.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 7c036590cd07..095a63892257 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -223,12 +223,13 @@ The flags are:: f Include the function name s Include the source file name l Include line number + d Include call trace For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only the ``p`` flag has meaning, other flags are ignored. -Note the regexp ``^[-+=][fslmpt_]+$`` matches a flags specification. -To clear all flags at once, use ``=_`` or ``-fslmpt``. +Note the regexp ``^[-+=][fslmptd_]+$`` matches a flags specification. +To clear all flags at once, use ``=_`` or ``-fslmptd``. 
Debug messages during Boot Process diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..05743900a116 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -38,11 +38,12 @@ struct _ddebug { #define _DPRINTK_FLAGS_INCL_LINENO (1<<3) #define _DPRINTK_FLAGS_INCL_TID (1<<4) #define _DPRINTK_FLAGS_INCL_SOURCENAME (1<<5) +#define _DPRINTK_FLAGS_INCL_STACK (1<<6) #define _DPRINTK_FLAGS_INCL_ANY \ (_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\ _DPRINTK_FLAGS_INCL_LINENO | _DPRINTK_FLAGS_INCL_TID |\ - _DPRINTK_FLAGS_INCL_SOURCENAME) + _DPRINTK_FLAGS_INCL_SOURCENAME | _DPRINTK_FLAGS_INCL_STACK) #if defined DEBUG #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT @@ -160,6 +161,12 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); +#define __dynamic_dump_stack(desc) \ +{ \ + if (desc.flags & _DPRINTK_FLAGS_INCL_STACK) \ + dump_stack(); \ +} + #define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ @@ -220,8 +227,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, */ #define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(&id, ##__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call(id, fmt, func, ...) \ __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ @@ -229,8 +238,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call_no_desc(id, fmt, func, ...) \ __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5a007952f7f2..7d7892e57a01 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -95,6 +95,7 @@ static const struct { unsigned flag:8; char opt_char; } opt_array[] = { { _DPRINTK_FLAGS_INCL_SOURCENAME, 's' }, { _DPRINTK_FLAGS_INCL_LINENO, 'l' }, { _DPRINTK_FLAGS_INCL_TID, 't' }, + { _DPRINTK_FLAGS_INCL_STACK, 'd' }, { _DPRINTK_FLAGS_NONE, '_' }, }; -- cgit v1.2.3 From a0b8c6af29a4be3ca2ff9a95cf71e54db5d73e65 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 24 Oct 2025 21:51:20 +0100 Subject: lib/xxhash: remove more unused xxh functions xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the xxh32_state struct is also unused. xxh64_copy_state() is also unused. Remove them all. (Also fixes a comment above the xxh64_state that referred to it as xxh32_state). Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org Signed-off-by: Dr. 
David Alan Gilbert Suggested-by: Christoph Hellwig Reviewed-by: Kuan-Wei Chiu Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/xxhash.h | 46 +--------------------------------------------- lib/xxhash.c | 29 ----------------------------- 2 files changed, 1 insertion(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 27f57eca8cb1..587122e2c29c 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -141,21 +141,7 @@ static inline unsigned long xxhash(const void *input, size_t length, */ /** - * struct xxh32_state - private xxh32 state, do not use members directly - */ -struct xxh32_state { - uint32_t total_len_32; - uint32_t large_len; - uint32_t v1; - uint32_t v2; - uint32_t v3; - uint32_t v4; - uint32_t mem32[4]; - uint32_t memsize; -}; - -/** - * struct xxh32_state - private xxh64 state, do not use members directly + * struct xxh64_state - private xxh64 state, do not use members directly */ struct xxh64_state { uint64_t total_len; @@ -167,16 +153,6 @@ struct xxh64_state { uint32_t memsize; }; -/** - * xxh32_reset() - reset the xxh32 state to start a new hashing operation - * - * @state: The xxh32 state to reset. - * @seed: Initialize the hash state with this seed. - * - * Call this function on any xxh32_state to prepare for a new hashing operation. - */ -void xxh32_reset(struct xxh32_state *state, uint32_t seed); - /** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * @@ -210,24 +186,4 @@ int xxh64_update(struct xxh64_state *state, const void *input, size_t length); */ uint64_t xxh64_digest(const struct xxh64_state *state); -/*-************************** - * Utils - ***************************/ - -/** - * xxh32_copy_state() - copy the source state into the destination state - * - * @src: The source xxh32 state. - * @dst: The destination xxh32 state. - */ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); - -/** - * xxh64_copy_state() - copy the source state into the destination state - * - * @src: The source xxh64 state. - * @dst: The destination xxh64 state. 
- */ -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); - #endif /* XXHASH_H */ diff --git a/lib/xxhash.c b/lib/xxhash.c index cf629766f376..4125b3e3cf7f 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -73,21 +73,6 @@ static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; -/*-************************** - * Utils - ***************************/ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh32_copy_state); - -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh64_copy_state); - /*-*************************** * Simple Hash Functions ****************************/ @@ -239,20 +224,6 @@ EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ -void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) -{ - /* use a local state for memcpy() to avoid strict-aliasing warnings */ - struct xxh32_state state; - - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - memcpy(statePtr, &state, sizeof(state)); -} -EXPORT_SYMBOL(xxh32_reset); - void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ -- cgit v1.2.3 From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO int oa function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 9 ++++++--- drivers/vfio/vfio_main.c | 7 +++++++ include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..1dc350003f07 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -996,9 +996,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; @@ -1132,6 +1134,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1458,7 +1461,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); + return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..a390163ce706 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1296,7 +1296,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + if (!device->ops->get_region_info) + goto ioctl_fallback; + ret = device->ops->get_region_info(device, uptr); + break; + default: +ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..be5fcf8432e8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..160bc2e31ece 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. 
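For a driver this reduces the per-region handler to filling in the kernel-side struct and, where needed, appending capabilities. A minimal sketch, using a made-up foo driver and a fixed 4K region layout purely for illustration (the in-tree conversions follow in a later patch):

	static int foo_get_region_info_caps(struct vfio_device *vdev,
					    struct vfio_region_info *info,
					    struct vfio_info_cap *caps)
	{
		/* FOO_NUM_REGIONS and the 4K layout are hypothetical */
		if (info->index >= FOO_NUM_REGIONS)
			return -EINVAL;

		info->offset = info->index * SZ_4K;
		info->size = SZ_4K;
		info->flags = VFIO_REGION_INFO_FLAG_READ |
			      VFIO_REGION_INFO_FLAG_WRITE;

		/*
		 * Capabilities, if any, are appended to @caps with
		 * vfio_info_add_capability(); argsz/cap_offset handling and
		 * all copies to and from userspace are done by the core.
		 */
		return 0;
	}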
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 56 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/vfio.h | 4 ++++ 2 files changed, 56 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f056e82ba350..48d034aede46 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1259,6 +1259,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + if (device->ops->get_region_info_caps) { + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, + caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; + } + } else if (device->ops->get_region_info) { + ret = device->ops->get_region_info(device, arg); + if (ret) + return ret; + } else { + return -EINVAL; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1297,10 +1348,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (unlikely(!device->ops->get_region_info)) - ret = -EINVAL; - else - ret = device->ops->get_region_info(device, uptr); + ret = vfio_get_region_info(device, uptr); break; default: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8..6311ddc83770 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); -- cgit v1.2.3 From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. 
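The conversion is mechanical: each handler drops its private copy_from_user()/copy_to_user(), argsz and cap_offset handling and instead fills in the vfio_region_info and vfio_info_cap handed down by the core, as the hunks below show.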
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 30 +++---- drivers/vfio/pci/mlx5/main.c | 2 +- drivers/vfio/pci/nvgrace-gpu/main.c | 51 +++--------- drivers/vfio/pci/pds/vfio_dev.c | 2 +- drivers/vfio/pci/qat/main.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 103 ++++++++++--------------- drivers/vfio/pci/virtio/common.h | 3 +- drivers/vfio/pci/virtio/legacy_io.c | 26 ++----- drivers/vfio/pci/virtio/main.c | 6 +- include/linux/vfio_pci_core.h | 3 +- 11 files changed, 80 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 1fd3a2075ee4..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, } static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - if (info.index != VFIO_PCI_BAR2_REGION_INDEX) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + info->size = hisi_acc_get_resource_len(vdev, info->index); - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - - info.size = hisi_acc_get_resource_len(vdev, info.index); - - info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = hisi_acc_vfio_ioctl_get_region, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1631,7 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b7f941f8047e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index cab743a30dc3..5a6f77d5c81e 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,34 +205,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static int -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -251,40 +242,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. 
* Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, @@ -686,7 +659,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, - .get_region_info = nvgrace_gpu_ioctl_get_region_info, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -707,7 +680,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 1946bc75d99b..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 8452d9c1d11d..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d9122efc10b..a3e49d42c771 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f21d9026068c..57c0766fb9f8 100644 --- 
a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -997,43 +997,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, } int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1045,9 +1037,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1057,16 +1049,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. 
*/ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1075,10 +1068,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1087,52 +1080,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index a10f2d92cb62..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index d735d5c4bd77..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_region_info info = {}; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; - default: - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d68096bc5252..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = virtiovf_pci_ioctl_get_region_info, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 160bc2e31ece..e74f94c17fbe 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). 
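
For any remaining driver that still implements the old callback, the conversion follows the same pattern as the virtio-vfio change above: drop the copy_from_user()/argsz check and the trailing copy_to_user() (the core ioctl handler now does both, along with the capability-chain handling) and fill in the caller-provided info and caps instead. A minimal sketch for a hypothetical driver exposing a single fixed-size region (the driver name and region size are illustrative, not part of this series):

    static int mydrv_get_region_info_caps(struct vfio_device *vdev,
                                          struct vfio_region_info *info,
                                          struct vfio_info_cap *caps)
    {
            /* The core has already validated argsz and copied in the header. */
            if (info->index != VFIO_PCI_BAR0_REGION_INDEX)
                    return -EINVAL;

            info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
            info->size = SZ_4K;             /* illustrative size */
            info->flags = VFIO_REGION_INFO_FLAG_READ |
                          VFIO_REGION_INFO_FLAG_WRITE;
            /* Optional capabilities go through vfio_info_add_capability(caps, ...). */
            return 0;
    }

    static const struct vfio_device_ops mydrv_ops = {
            /* ... other callbacks ... */
            .get_region_info_caps = mydrv_get_region_info_caps,
    };
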
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 50 +++++++++++++++++++++--------------------------- include/linux/vfio.h | 2 -- 2 files changed, 22 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 48d034aede46..b8fe1a75e48a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1267,42 +1267,36 @@ static long vfio_get_region_info(struct vfio_device *device, struct vfio_info_cap caps = {}; int ret; + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (device->ops->get_region_info_caps) { - ret = device->ops->get_region_info_caps(device, &info, &caps); - if (ret) - goto out_free; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, - caps.size)) { - ret = -EFAULT; - goto out_free; - } - info.cap_offset = sizeof(info); + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; } + info.cap_offset = sizeof(info); } + } - if (copy_to_user(arg, &info, minsz)) { - ret = -EFAULT; - goto out_free; - } - } else if (device->ops->get_region_info) { - ret = device->ops->get_region_info(device, arg); - if (ret) - return ret; - } else { - return -EINVAL; + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; } out_free: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc83770..8e1ddb48b9b5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); -- cgit v1.2.3 From 044b9f1a7f4f3d41563007d0762c83a7d7505ac0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 12 Nov 2025 08:46:14 +0100 Subject: PCI/PTM: Enable only if device advertises relevant role We have a Switch Upstream Port (2b:00.0) that has a PTM Capability, but doesn't advertise support for any PTM roles: Capabilities: [220 v1] Precision Time Measurement PTMCap: Requester- Responder- Root- Linux enables PTM without looking into what roles it actually supports, and apparently the Port immediately sends PTM Requests even though it doesn't support the PTM Requester role. 
The messages include an invalid bus number, so the Root Port detects an ACS Violation (see the PCIe r7.0, sec 6.12.1.1, implementation note): pci 0000:2b:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port pci 0000:2b:00.0: PTM enabled, 4ns granularity pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:e44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 The TLP Header shows a 4 DW header, no data (001b) Msg with Local routing (1 0100b) with Requester ID 0x0000 and PTM Request code (0x52). Fix this by enabling PTM only if the following conditions are true (see sec 6.21.1 figure 6-21): - Endpoint must advertise PTM Requester Capable - Switch Upstream Port must advertise PTM Responder Capable - Root Port must advertise PTM Root Capable Signed-off-by: Mika Westerberg [bhelgaas: commit log, comments] Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251112074614.1440266-1-mika.westerberg@linux.intel.com --- drivers/pci/pcie/ptm.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 2 ++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 65e4b008be00..ed0f9691e7d1 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev) dev->ptm_granularity = 0; } + if (cap & PCI_PTM_CAP_RES) + dev->ptm_responder = 1; + if (cap & PCI_PTM_CAP_REQ) + dev->ptm_requester = 1; + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) pci_enable_ptm(dev, NULL); @@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev) return -EINVAL; } + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_ROOT_PORT: + if (!dev->ptm_root) + return -EINVAL; + break; + case PCI_EXP_TYPE_UPSTREAM: + if (!dev->ptm_responder) + return -EINVAL; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + if (!dev->ptm_requester) + return -EINVAL; + break; + default: + return -EINVAL; + } + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); ctrl |= PCI_PTM_CTRL_ENABLE; diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..d5018cb5c331 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -500,6 +500,8 @@ struct pci_dev { #ifdef CONFIG_PCIE_PTM u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; + unsigned int ptm_responder:1; + unsigned int ptm_requester:1; unsigned int ptm_enabled:1; u8 ptm_granularity; #endif -- cgit v1.2.3 From 6276c67f2bc4aeaf350a7cf889c33c38b3330ea9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Nov 2025 09:39:44 -0800 Subject: x86: Restrict KVM-induced symbol exports to KVM modules where obvious/possible Extend KVM's export macro framework to provide EXPORT_SYMBOL_FOR_KVM(), and use the helper macro to export symbols for KVM throughout x86 if and only if KVM will build one or more modules, and only for those modules. To avoid unnecessary exports when CONFIG_KVM=m but kvm.ko will not be built (because no vendor modules are selected), let arch code #define EXPORT_SYMBOL_FOR_KVM to suppress/override the exports. 
Note, the set of symbols to restrict to KVM was generated by manual search and audit; any "misses" are due to human error, not some grand plan. Signed-off-by: Sean Christopherson Signed-off-by: Dave Hansen Acked-by: Kai Huang Tested-by: Kai Huang Link: https://patch.msgid.link/20251112173944.1380633-5-seanjc%40google.com --- arch/x86/entry/entry.S | 7 ++-- arch/x86/entry/entry_64.S | 3 +- arch/x86/entry/entry_64_fred.S | 3 +- arch/x86/events/amd/core.c | 5 +-- arch/x86/events/core.c | 7 ++-- arch/x86/events/intel/lbr.c | 3 +- arch/x86/events/intel/pt.c | 7 ++-- arch/x86/include/asm/kvm_types.h | 5 +++ arch/x86/kernel/apic/apic.c | 3 +- arch/x86/kernel/apic/apic_common.c | 3 +- arch/x86/kernel/cpu/amd.c | 4 +-- arch/x86/kernel/cpu/bugs.c | 17 +++++----- arch/x86/kernel/cpu/bus_lock.c | 3 +- arch/x86/kernel/cpu/common.c | 7 ++-- arch/x86/kernel/cpu/sgx/main.c | 3 +- arch/x86/kernel/cpu/sgx/virt.c | 5 +-- arch/x86/kernel/e820.c | 3 +- arch/x86/kernel/fpu/core.c | 21 ++++++------ arch/x86/kernel/fpu/xstate.c | 7 ++-- arch/x86/kernel/hw_breakpoint.c | 3 +- arch/x86/kernel/irq.c | 3 +- arch/x86/kernel/kvm.c | 5 +-- arch/x86/kernel/nmi.c | 5 ++- arch/x86/kernel/process_64.c | 5 ++- arch/x86/kernel/reboot.c | 5 +-- arch/x86/kernel/tsc.c | 1 + arch/x86/lib/cache-smp.c | 9 ++--- arch/x86/lib/msr.c | 5 +-- arch/x86/mm/pat/memtype.c | 3 +- arch/x86/mm/tlb.c | 5 +-- arch/x86/virt/vmx/tdx/tdx.c | 69 +++++++++++++++++++------------------- include/linux/kvm_types.h | 14 ++++++++ 32 files changed, 144 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8e9a0cc20a4a..1d723c5ae9dd 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -29,8 +30,7 @@ SYM_FUNC_START(write_ibpb) FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET SYM_FUNC_END(write_ibpb) -/* For KVM */ -EXPORT_SYMBOL_GPL(write_ibpb); +EXPORT_SYMBOL_FOR_KVM(write_ibpb); .popsection @@ -48,8 +48,7 @@ SYM_CODE_START_NOALIGN(x86_verw_sel) .word __KERNEL_DS .align L1_CACHE_BYTES, 0xcc SYM_CODE_END(x86_verw_sel); -/* For KVM */ -EXPORT_SYMBOL_GPL(x86_verw_sel); +EXPORT_SYMBOL_FOR_KVM(x86_verw_sel); .popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed04a968cc7d..f9983a1907bf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -19,6 +19,7 @@ * - idtentry: Define exception entry points. 
*/ #include +#include #include #include #include @@ -1566,5 +1567,5 @@ SYM_FUNC_START(clear_bhb_loop) pop %rbp RET SYM_FUNC_END(clear_bhb_loop) -EXPORT_SYMBOL_GPL(clear_bhb_loop) +EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop) STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index fafbd3e68cb8..894f7f16eb80 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -146,5 +147,5 @@ SYM_FUNC_START(asm_fred_entry_from_kvm) RET SYM_FUNC_END(asm_fred_entry_from_kvm) -EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm); #endif diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b20661b8621d..2dd9afb8dd9d 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -1569,7 +1570,7 @@ void amd_pmu_enable_virt(void) /* Reload all events */ amd_pmu_reload_virt(); } -EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); +EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt); void amd_pmu_disable_virt(void) { @@ -1586,4 +1587,4 @@ void amd_pmu_disable_virt(void) /* Reload all events */ amd_pmu_reload_virt(); } -EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); +EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 745caa6c15a3..b5e397fa0835 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -714,7 +715,7 @@ struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); +EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. 
The PMI hitting could be before or @@ -3106,7 +3107,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); +EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { @@ -3117,4 +3118,4 @@ u64 perf_get_hw_event_config(int hw_event) return 0; } -EXPORT_SYMBOL_GPL(perf_get_hw_event_config); +EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7aa59966e7c3..72f2adcda7c6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include @@ -1705,7 +1706,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->info = x86_pmu.lbr_info; lbr->has_callstack = x86_pmu_has_lbr_callstack(); } -EXPORT_SYMBOL_GPL(x86_perf_get_lbr); +EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr); struct event_constraint vlbr_constraint = __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e8cf29d2b10c..44524a387c58 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -82,13 +83,13 @@ u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) return (c & cd->mask) >> shift; } -EXPORT_SYMBOL_GPL(intel_pt_validate_cap); +EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap); u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return intel_pt_validate_cap(pt_pmu.caps, cap); } -EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); +EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, @@ -1590,7 +1591,7 @@ void intel_pt_handle_vmx(int on) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); +EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx); /* * PMU callbacks diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h index 23268a188e70..d7c704ed1be9 100644 --- a/arch/x86/include/asm/kvm_types.h +++ b/arch/x86/include/asm/kvm_types.h @@ -10,6 +10,11 @@ #define KVM_SUB_MODULES kvm-intel #else #undef KVM_SUB_MODULES +/* + * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff + * at least one vendor module is enabled. 
+ */ +#define EXPORT_SYMBOL_FOR_KVM(symbol) #endif #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 680d305589a3..dcf4dc7a9eac 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -2316,7 +2317,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) dest |= msg->arch_addr_hi.destid_8_31 << 8; return dest; } -EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid); static void __init apic_bsp_up_setup(void) { diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 9ef3be866832..2ed3b5c88c7f 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: GPL-2.0 */ #include +#include #include #include "local.h" @@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu) else return BAD_APICID; } -EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); +EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid); /* * Set up the logical destination ID when the APIC operates in logical diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8e36964a7721..69a3c02cab48 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -3,7 +3,7 @@ #include #include #include - +#include #include #include #include @@ -1310,7 +1310,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); } -EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); +EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask); static void zenbleed_check_cpu(void *unused) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 57c1d0ed36a5..d11a7655994e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -178,7 +179,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control IBPB on vCPU load */ DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); -EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); +EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb); /* Control CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear); @@ -197,7 +198,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); * mitigation is required. */ DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); -EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); +EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear); #undef pr_fmt #define pr_fmt(fmt) "mitigations: " fmt @@ -365,7 +366,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) speculation_ctrl_update(tif); } } -EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); +EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl); static void x86_amd_ssb_disable(void) { @@ -1031,7 +1032,7 @@ bool gds_ucode_mitigated(void) return (gds_mitigation == GDS_MITIGATION_FULL || gds_mitigation == GDS_MITIGATION_FULL_LOCKED); } -EXPORT_SYMBOL_GPL(gds_ucode_mitigated); +EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated); void update_gds_msr(void) { @@ -2858,7 +2859,7 @@ void x86_spec_ctrl_setup_ap(void) } bool itlb_multihit_kvm_mitigation; -EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); +EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation); #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -2866,11 +2867,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_L1TF) ? 
L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; -#if IS_ENABLED(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(l1tf_mitigation); -#endif +EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation); enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); +EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation); /* * These CPUs all support 44bits physical address space internally in the diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 981f8b1f0792..dbc99a47be45 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip) force_sig_fault(SIGBUS, BUS_ADRALN, NULL); return false; } -EXPORT_SYMBOL_GPL(handle_guest_split_lock); +EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock); void bus_lock_init(void) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7d3512914ca..71bb04e6a5bc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -460,14 +461,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear) __write_cr4(newval); } } -EXPORT_SYMBOL(cr4_update_irqsoff); +EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff); /* Read the CR4 shadow. */ unsigned long cr4_read_shadow(void) { return this_cpu_read(cpu_tlbstate.cr4); } -EXPORT_SYMBOL_GPL(cr4_read_shadow); +EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow); void cr4_init(void) { @@ -722,7 +723,7 @@ void load_direct_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -EXPORT_SYMBOL_GPL(load_direct_gdt); +EXPORT_SYMBOL_FOR_KVM(load_direct_gdt); /* Load a fixmap remapping of the per-cpu GDT */ void load_fixmap_gdt(int cpu) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 2de01b379aa3..fc8fb64d62f4 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -915,7 +916,7 @@ int sgx_set_attribute(unsigned long *allowed_attributes, *allowed_attributes |= SGX_ATTR_PROVISIONKEY; return 0; } -EXPORT_SYMBOL_GPL(sgx_set_attribute); +EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); static int __init sgx_init(void) { diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 7aaa3652e31d..727f2570c8b9 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -5,6 +5,7 @@ * Copyright(c) 2021 Intel Corporation. 
*/ +#include #include #include #include @@ -363,7 +364,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, WARN_ON_ONCE(ret); return 0; } -EXPORT_SYMBOL_GPL(sgx_virt_ecreate); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate); static int __sgx_virt_einit(void __user *sigstruct, void __user *token, void __user *secs) @@ -432,4 +433,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token, return ret; } -EXPORT_SYMBOL_GPL(sgx_virt_einit); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c3acbd26408b..b15b97d3cb52 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) { return _e820__mapped_any(e820_table_firmware, start, end, type); } -EXPORT_SYMBOL_GPL(e820__mapped_raw_any); +EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any); bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e88eacb1b5bb..da233f20ae6f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) return true; } -EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { @@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = NULL; vfree(fpstate); } -EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable @@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) return __xfd_enable_feature(xfeatures, guest_fpu); } -EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); +EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) @@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } -EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); +EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state @@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void) __this_cpu_write(xfd_state, fpstate->xfd); } } -EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) @@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) fpregs_unlock(); return 0; } -EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) @@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } -EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); +EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) @@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, 
return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } -EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) @@ -857,7 +858,7 @@ void switch_fpu_return(void) fpregs_restore_userregs(); } -EXPORT_SYMBOL_GPL(switch_fpu_return); +EXPORT_SYMBOL_FOR_KVM(switch_fpu_return); void fpregs_lock_and_load(void) { @@ -892,7 +893,7 @@ void fpregs_assert_state_consistent(void) WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } -EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 28e4fd65c9da..48113c5193aa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); +EXPORT_SYMBOL_FOR_KVM(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. @@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu if (addr) memset(addr, 0, xstate_sizes[xfeature]); } -EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 @@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } -EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); +EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index b01644c949b2..f846c15f21ca 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -489,7 +490,7 @@ void hw_breakpoint_restore(void) set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } -EXPORT_SYMBOL_GPL(hw_breakpoint_restore); +EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore); /* * Handle debug exception notifications. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 10721a125226..86f4e574de02 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) synchronize_rcu(); } } -EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); +EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b67d7c59dca0..204765004c72 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token) } finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); +EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { @@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); +EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index be93ec7255bf..3d239ed12744 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -#if IS_MODULE(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); -#endif +EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx); #endif #ifdef CONFIG_NMI_CHECK_CPU diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52a5c03c353c..432c0a004c60 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -303,9 +304,7 @@ void current_save_fsgs(void) save_fsgs(current); local_irq_restore(flags); } -#if IS_ENABLED(CONFIG_KVM) -EXPORT_SYMBOL_GPL(current_save_fsgs); -#endif +EXPORT_SYMBOL_FOR_KVM(current_save_fsgs); static __always_inline void loadseg(enum which_selector which, unsigned short sel) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 964f6b0a3d68..6032fa9ec753 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) rcu_assign_pointer(cpu_emergency_virt_callback, callback); } -EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); +EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) { @@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) rcu_assign_pointer(cpu_emergency_virt_callback, NULL); synchronize_rcu(); } -EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); +EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback); /* * Disable virtualization, i.e. 
VMX or SVM, to ensure INIT is recognized during diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 87e749106dda..7d3e13e14eab 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index c5c60d07308c..824664c0ecbd 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -2,6 +2,7 @@ #include #include #include +#include static void __wbinvd(void *dummy) { @@ -12,7 +13,7 @@ void wbinvd_on_cpu(int cpu) { smp_call_function_single(cpu, __wbinvd, NULL, 1); } -EXPORT_SYMBOL(wbinvd_on_cpu); +EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu); void wbinvd_on_all_cpus(void) { @@ -24,7 +25,7 @@ void wbinvd_on_cpus_mask(struct cpumask *cpus) { on_each_cpu_mask(cpus, __wbinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask); +EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask); static void __wbnoinvd(void *dummy) { @@ -35,10 +36,10 @@ void wbnoinvd_on_all_cpus(void) { on_each_cpu(__wbnoinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus); +EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus); void wbnoinvd_on_cpus_mask(struct cpumask *cpus) { on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask); +EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 4ef7c6dcbea6..dfdd1da89f36 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -103,7 +104,7 @@ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } -EXPORT_SYMBOL_GPL(msr_set_bit); +EXPORT_SYMBOL_FOR_KVM(msr_set_bit); /** * msr_clear_bit - Clear @bit in a MSR @msr. 
@@ -119,7 +120,7 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } -EXPORT_SYMBOL_GPL(msr_clear_bit); +EXPORT_SYMBOL_FOR_KVM(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(u32 msr, u64 val, int failed) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index b68200a0e0c6..8a3d9722f602 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -697,7 +698,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) cm == _PAGE_CACHE_MODE_UC_MINUS || cm == _PAGE_CACHE_MODE_WC; } -EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); +EXPORT_SYMBOL_FOR_KVM(pat_pfn_immune_to_uc_mtrr); /** * memtype_reserve_io - Request a memory type mapping for a region of memory diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5d221709353e..f5b93e01e347 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -1582,7 +1583,7 @@ unsigned long __get_current_cr3_fast(void) VM_BUG_ON(cr3 != __read_cr3()); return cr3; } -EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast); /* * Flush one page in the kernel mapping @@ -1723,7 +1724,7 @@ void __flush_tlb_all(void) flush_tlb_local(); } } -EXPORT_SYMBOL_GPL(__flush_tlb_all); +EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all); void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index eac403248462..5ce4ebe99774 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -181,7 +182,7 @@ int tdx_cpu_enable(void) return 0; } -EXPORT_SYMBOL_GPL(tdx_cpu_enable); +EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable); /* * Add a memory region as a TDX memory block. 
The caller must make sure @@ -662,7 +663,7 @@ void tdx_quirk_reset_page(struct page *page) { tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); } -EXPORT_SYMBOL_GPL(tdx_quirk_reset_page); +EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) { @@ -1216,7 +1217,7 @@ int tdx_enable(void) return ret; } -EXPORT_SYMBOL_GPL(tdx_enable); +EXPORT_SYMBOL_FOR_KVM(tdx_enable); static bool is_pamt_page(unsigned long phys) { @@ -1477,13 +1478,13 @@ const struct tdx_sys_info *tdx_get_sysinfo(void) return p; } -EXPORT_SYMBOL_GPL(tdx_get_sysinfo); +EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); u32 tdx_get_nr_guest_keyids(void) { return tdx_nr_guest_keyids; } -EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); +EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids); int tdx_guest_keyid_alloc(void) { @@ -1491,13 +1492,13 @@ int tdx_guest_keyid_alloc(void) tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); +EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc); void tdx_guest_keyid_free(unsigned int keyid) { ida_free(&tdx_guest_keyid_pool, keyid); } -EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); +EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free); static inline u64 tdx_tdr_pa(struct tdx_td *td) { @@ -1521,7 +1522,7 @@ noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); } -EXPORT_SYMBOL_GPL(tdh_vp_enter); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) { @@ -1533,7 +1534,7 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) tdx_clflush_page(tdcs_page); return seamcall(TDH_MNG_ADDCX, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_addcx); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) { @@ -1553,7 +1554,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_add); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add); u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) { @@ -1572,7 +1573,7 @@ u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_sept_add); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) { @@ -1584,7 +1585,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) tdx_clflush_page(tdcx_page); return seamcall(TDH_VP_ADDCX, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_addcx); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx); u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) { @@ -1603,7 +1604,7 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_aug); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug); u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) { @@ -1620,7 +1621,7 @@ u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u6 return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_range_block); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block); u64 tdh_mng_key_config(struct tdx_td *td) { @@ -1630,7 +1631,7 @@ u64 tdh_mng_key_config(struct tdx_td *td) return seamcall(TDH_MNG_KEY_CONFIG, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_key_config); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config); u64 
tdh_mng_create(struct tdx_td *td, u16 hkid) { @@ -1642,7 +1643,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid) tdx_clflush_page(td->tdr_page); return seamcall(TDH_MNG_CREATE, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_create); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_create); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) { @@ -1654,7 +1655,7 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) tdx_clflush_page(vp->tdvpr_page); return seamcall(TDH_VP_CREATE, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_create); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_create); u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) { @@ -1671,7 +1672,7 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) return ret; } -EXPORT_SYMBOL_GPL(tdh_mng_rd); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd); u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) { @@ -1688,7 +1689,7 @@ u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) return ret; } -EXPORT_SYMBOL_GPL(tdh_mr_extend); +EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend); u64 tdh_mr_finalize(struct tdx_td *td) { @@ -1698,7 +1699,7 @@ u64 tdh_mr_finalize(struct tdx_td *td) return seamcall(TDH_MR_FINALIZE, &args); } -EXPORT_SYMBOL_GPL(tdh_mr_finalize); +EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize); u64 tdh_vp_flush(struct tdx_vp *vp) { @@ -1708,7 +1709,7 @@ u64 tdh_vp_flush(struct tdx_vp *vp) return seamcall(TDH_VP_FLUSH, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_flush); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush); u64 tdh_mng_vpflushdone(struct tdx_td *td) { @@ -1718,7 +1719,7 @@ u64 tdh_mng_vpflushdone(struct tdx_td *td) return seamcall(TDH_MNG_VPFLUSHDONE, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone); u64 tdh_mng_key_freeid(struct tdx_td *td) { @@ -1728,7 +1729,7 @@ u64 tdh_mng_key_freeid(struct tdx_td *td) return seamcall(TDH_MNG_KEY_FREEID, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_key_freeid); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) { @@ -1744,7 +1745,7 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) return ret; } -EXPORT_SYMBOL_GPL(tdh_mng_init); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_init); u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) { @@ -1761,7 +1762,7 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) return ret; } -EXPORT_SYMBOL_GPL(tdh_vp_rd); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) { @@ -1774,7 +1775,7 @@ u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) return seamcall(TDH_VP_WR, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_wr); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) { @@ -1787,7 +1788,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) /* apicid requires version == 1. */ return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); } -EXPORT_SYMBOL_GPL(tdh_vp_init); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_init); /* * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats. 
@@ -1809,7 +1810,7 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 return ret; } -EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim); u64 tdh_mem_track(struct tdx_td *td) { @@ -1819,7 +1820,7 @@ u64 tdh_mem_track(struct tdx_td *td) return seamcall(TDH_MEM_TRACK, &args); } -EXPORT_SYMBOL_GPL(tdh_mem_track); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_track); u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) { @@ -1836,7 +1837,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6 return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_remove); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove); u64 tdh_phymem_cache_wb(bool resume) { @@ -1846,7 +1847,7 @@ u64 tdh_phymem_cache_wb(bool resume) return seamcall(TDH_PHYMEM_CACHE_WB, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) { @@ -1856,7 +1857,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr); u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) { @@ -1866,7 +1867,7 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); #ifdef CONFIG_KEXEC_CORE void tdx_cpu_flush_cache_for_kexec(void) @@ -1884,5 +1885,5 @@ void tdx_cpu_flush_cache_for_kexec(void) wbinvd(); this_cpu_write(cache_state_incoherent, false); } -EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec); +EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); #endif diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 490464c205b4..a568d8e6f4e8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -11,8 +11,22 @@ #ifdef KVM_SUB_MODULES #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \ EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES)) +#define EXPORT_SYMBOL_FOR_KVM(symbol) \ + EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES)) #else #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) +/* + * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only + * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but + * kvm.ko won't actually be built (due to lack of at least one submodule). + */ +#ifndef EXPORT_SYMBOL_FOR_KVM +#if IS_MODULE(CONFIG_KVM) +#define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm") +#else +#define EXPORT_SYMBOL_FOR_KVM(symbol) +#endif /* IS_MODULE(CONFIG_KVM) */ +#endif /* EXPORT_SYMBOL_FOR_KVM */ #endif #ifndef __ASSEMBLER__ -- cgit v1.2.3 From 4495bffd86ba0fdabfaef0c41d12f68ec2a1e05b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 10 Nov 2025 16:22:25 -0600 Subject: PCI/ASPM: Cache L0s/L1 Supported so advertised link states can be overridden Defective devices sometimes advertise support for ASPM L0s or L1 states even if they don't work correctly. Cache the L0s Supported and L1 Supported bits early in enumeration so HEADER quirks can override the ASPM states advertised in Link Capabilities before pcie_aspm_cap_init() enables ASPM. 
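
The intended consumer is a HEADER-stage quirk that clears the cached bits for a device whose advertisement is known to be bogus, so pcie_aspm_cap_init() never treats the broken state as supported. A rough sketch of such a quirk, with placeholder vendor/device IDs (the new fields only exist when CONFIG_PCIEASPM is enabled):

    #ifdef CONFIG_PCIEASPM
    static void quirk_no_aspm_l0s(struct pci_dev *dev)
    {
            /* Device advertises L0s in Link Capabilities but it does not work. */
            dev->aspm_l0s_support = 0;
    }
    DECLARE_PCI_FIXUP_HEADER(0x1234, 0x5678, quirk_no_aspm_l0s);
    #endif
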
Signed-off-by: Bjorn Helgaas Tested-by: Shawn Lin Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20251110222929.2140564-2-helgaas@kernel.org --- drivers/pci/pcie/aspm.c | 12 ++++-------- drivers/pci/probe.c | 7 +++++++ include/linux/pci.h | 2 ++ 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 7cc8281e7011..15d50c089070 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -830,7 +830,6 @@ static void pcie_aspm_override_default_link_state(struct pcie_link_state *link) static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; - u32 parent_lnkcap, child_lnkcap; u16 parent_lnkctl, child_lnkctl; struct pci_bus *linkbus = parent->subordinate; @@ -845,9 +844,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * If ASPM not supported, don't mess with the clocks and link, * bail out now. */ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); - if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS)) + if (!(parent->aspm_l0s_support && child->aspm_l0s_support) && + !(parent->aspm_l1_support && child->aspm_l1_support)) return; /* Configure common clock before checking latencies */ @@ -859,8 +857,6 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * read-only Link Capabilities may change depending on common clock * configuration (PCIe r5.0, sec 7.5.3.6). */ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); pcie_capability_read_word(parent, PCI_EXP_LNKCTL, &parent_lnkctl); pcie_capability_read_word(child, PCI_EXP_LNKCTL, &child_lnkctl); @@ -880,7 +876,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * given link unless components on both sides of the link each * support L0s. 
*/ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) + if (parent->aspm_l0s_support && child->aspm_l0s_support) link->aspm_support |= PCIE_LINK_STATE_L0S; if (child_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S) @@ -889,7 +885,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) link->aspm_enabled |= PCIE_LINK_STATE_L0S_DW; /* Setup L1 state */ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1) + if (parent->aspm_l1_support && child->aspm_l1_support) link->aspm_support |= PCIE_LINK_STATE_L1; if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c83e75a0ec12..de72ceaea285 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1663,6 +1663,13 @@ void set_pcie_port_type(struct pci_dev *pdev) if (reg32 & PCI_EXP_LNKCAP_DLLLARC) pdev->link_active_reporting = 1; +#ifdef CONFIG_PCIEASPM + if (reg32 & PCI_EXP_LNKCAP_ASPM_L0S) + pdev->aspm_l0s_support = 1; + if (reg32 & PCI_EXP_LNKCAP_ASPM_L1) + pdev->aspm_l1_support = 1; +#endif + parent = pci_upstream_bridge(pdev); if (!parent) return; diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..bf97d49c23cf 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,6 +412,8 @@ struct pci_dev { u16 l1ss; /* L1SS Capability pointer */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ + unsigned int aspm_l0s_support:1; /* ASPM L0s support */ + unsigned int aspm_l1_support:1; /* ASPM L1 support */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ #endif -- cgit v1.2.3 From 2bcd3800f2da1be13b972858f63c66d035b1ec6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:15 +0000 Subject: slab: Reimplement page_slab() In order to separate slabs from folios, we need to convert from any page in a slab to the slab directly without going through a page to folio conversion first. Up to this point, page_slab() has followed the example of other memdesc converters (page_folio(), page_ptdesc() etc) and just cast the pointer to the requested type, regardless of whether the pointer is actually a pointer to the correct type or not. That changes with this commit; we check that the page actually belongs to a slab and return NULL if it does not. Other memdesc converters will adopt this convention in future. kfence was the only user of page_slab(), so adjust it to the new way of working. It will need to be touched again when we separate slab from page. Signed-off-by: Matthew Wilcox (Oracle) Cc: Alexander Potapenko Cc: Marco Elver Cc: kasan-dev@googlegroups.com Link: https://patch.msgid.link/20251113000932.1589073-2-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Tested-by: Marco Elver Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 14 +------------- mm/kfence/core.c | 14 ++++++++------ mm/slab.h | 28 ++++++++++++++++------------ 3 files changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0091ad1986bf..6d5e44968eab 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) -FOLIO_TYPE_OPS(slab, slab) - -/** - * PageSlab - Determine if the page belongs to the slab allocator - * @page: The page to test. - * - * Context: Any context. 
- * Return: True for slab pages, false for any other kind of page. - */ -static inline bool PageSlab(const struct page *page) -{ - return folio_test_slab(page_folio(page)); -} +PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 727c20c94ac5..e62b5516bf48 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); - __folio_set_slab(slab_folio(slab)); + page = pfn_to_page(start_pfn + i); + __SetPageSlab(page); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); + page = pfn_to_page(start_pfn + i); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = 0; #endif - __folio_clear_slab(slab_folio(slab)); + __ClearPageSlab(page); } return addr; diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..a64b9b2c8731 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -146,20 +146,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) struct slab *: (struct folio *)s)) /** - * page_slab - Converts from first struct page to slab. - * @p: The first (either head of compound or single) page of slab. + * page_slab - Converts from struct page to its slab. + * @page: A page which may or may not belong to a slab. * - * A temporary wrapper to convert struct page to struct slab in situations where - * we know the page is the compound head, or single order-0 page. - * - * Long-term ideally everything would work with struct slab directly or go - * through folio to struct slab. - * - * Return: The slab which contains this page + * Return: The slab which contains this page or NULL if the page does + * not belong to a slab. This includes pages returned from large kmalloc. */ -#define page_slab(p) (_Generic((p), \ - const struct page *: (const struct slab *)(p), \ - struct page *: (struct slab *)(p))) +static inline struct slab *page_slab(const struct page *page) +{ + unsigned long head; + + head = READ_ONCE(page->compound_head); + if (head & 1) + page = (struct page *)(head - 1); + if (data_race(page->page_type >> 24) != PGTY_slab) + page = NULL; + + return (struct slab *)page; +} /** * slab_page - The first struct page allocated for a slab -- cgit v1.2.3 From ee1ee8abc4197e21594ca29348629ccbfff4daec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:16 +0000 Subject: slab: Remove folio references from __ksize() In the future, we will separate slab, folio and page from each other and calling virt_to_folio() on an address allocated from slab will return NULL. Delay the conversion from struct page to struct slab until we know we're not dealing with a large kmalloc allocation. There's a minor win for large kmalloc allocations as we avoid the compound_head() hidden in virt_to_folio(). This deprecates calling ksize() on memory allocated by alloc_pages(). Today it becomes a warning and support will be removed entirely in the future. 
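
Concretely, the now-deprecated pattern is asking the slab API for the size of a page allocation; with this patch it still returns page_size() but trips the new WARN_ON() in __ksize(). A hypothetical example of the usage being phased out (not code from this series):

    void *buf = (void *)__get_free_pages(GFP_KERNEL, 2);   /* alloc_pages-backed */
    size_t sz = ksize(buf);   /* now warns; callers should track the size themselves */
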
Introduce large_kmalloc_size() to abstract how we represent the size of a large kmalloc allocation. For now, this is the same as page_size(), but it will change with separately allocated memdescs. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-3-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 2 +- mm/slab.h | 10 ++++++++++ mm/slab_common.c | 23 ++++++++++++----------- 3 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6d5e44968eab..f7a0e4af0c73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1064,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) -FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) +PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/mm/slab.h b/mm/slab.h index a64b9b2c8731..31ccf0f6d3a1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -605,6 +605,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +static inline unsigned int large_kmalloc_order(const struct page *page) +{ + return page[1].flags.f & 0xff; +} + +static inline size_t large_kmalloc_size(const struct page *page) +{ + return PAGE_SIZE << large_kmalloc_order(page); +} + #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index 932d13ada36c..67ad2328276e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void) */ size_t __ksize(const void *object) { - struct folio *folio; + const struct page *page; + const struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return 0; - folio = virt_to_folio(object); + page = virt_to_page(object); - if (unlikely(!folio_test_slab(folio))) { - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) - return 0; - if (WARN_ON(object != folio_address(folio))) - return 0; - return folio_size(folio); - } + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); #ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(folio_slab(folio)->slab_cache, object); + skip_orig_size_check(slab->slab_cache, object); #endif - return slab_ksize(folio_slab(folio)->slab_cache); + return slab_ksize(slab->slab_cache); } gfp_t kmalloc_fix_flags(gfp_t flags) -- cgit v1.2.3 From 4f49088c162579a4ed049c555fe0cd188fd928c4 Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Wed, 8 Oct 2025 17:09:05 +0800 Subject: firmware: stratix10-svc: Add definition for voltage and temperature sensor Add entry in Stratix 10 Service Layer to support temperature and voltage sensor. 
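
For context, a client (for example a future hwmon driver) would reach this service through the existing Stratix10 service-channel API. A rough sketch of a temperature read, assuming the usual request/send flow from stratix10-svc-client.h and assuming the sensor channel travels in msg.arg[0] (as the service-thread change in this patch suggests); "client" is a stratix10_svc_client the caller has already set up with a receive_cb, which gets the result with kaddr1 pointing at SMC return register a1:

    struct stratix10_svc_chan *chan;
    struct stratix10_svc_client_msg msg = {
            .command = COMMAND_HWMON_READTEMP,
            .arg = { 0 },           /* sensor channel, passed to EL3 in a1 */
    };

    chan = stratix10_svc_request_channel_byname(&client, SVC_CLIENT_HWMON);
    if (!IS_ERR(chan))
            stratix10_svc_send(chan, &msg); /* value delivered via receive_cb */
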
Signed-off-by: Khairul Anuar Romli Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 21 +++++++++++-- include/linux/firmware/intel/stratix10-smc.h | 34 ++++++++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 8 ++++- 3 files changed, 60 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index e3f990d888d7..5a32c1054bee 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -34,7 +34,7 @@ * timeout is set to 30 seconds (30 * 1000) at Intel Stratix10 SoC. */ #define SVC_NUM_DATA_IN_FIFO 32 -#define SVC_NUM_CHANNEL 3 +#define SVC_NUM_CHANNEL 4 #define FPGA_CONFIG_DATA_CLAIM_TIMEOUT_MS 200 #define FPGA_CONFIG_STATUS_TIMEOUT_SEC 30 #define BYTE_TO_WORD_SIZE 4 @@ -341,6 +341,8 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data, case COMMAND_RSU_MAX_RETRY: case COMMAND_RSU_DCMF_STATUS: case COMMAND_FIRMWARE_VERSION: + case COMMAND_HWMON_READTEMP: + case COMMAND_HWMON_READVOLT: cb_data->status = BIT(SVC_STATUS_OK); cb_data->kaddr1 = &res.a1; break; @@ -525,7 +527,17 @@ static int svc_normal_to_secure_thread(void *data) a1 = (unsigned long)pdata->paddr; a2 = 0; break; - + /* for HWMON */ + case COMMAND_HWMON_READTEMP: + a0 = INTEL_SIP_SMC_HWMON_READTEMP; + a1 = pdata->arg[0]; + a2 = 0; + break; + case COMMAND_HWMON_READVOLT: + a0 = INTEL_SIP_SMC_HWMON_READVOLT; + a1 = pdata->arg[0]; + a2 = 0; + break; /* for polling */ case COMMAND_POLL_SERVICE_STATUS: a0 = INTEL_SIP_SMC_SERVICE_COMPLETED; @@ -1197,6 +1209,11 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) chans[2].name = SVC_CLIENT_FCS; spin_lock_init(&chans[2].lock); + chans[3].scl = NULL; + chans[3].ctrl = controller; + chans[3].name = SVC_CLIENT_HWMON; + spin_lock_init(&chans[3].lock); + list_add_tail(&controller->node, &svc_ctrl); platform_set_drvdata(pdev, controller); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index ee80ca4bb0d0..7306dd243b2a 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -620,4 +620,38 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_FCS_GET_PROVISION_DATA \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA) +/** + * Request INTEL_SIP_SMC_HWMON_READTEMP + * Sync call to request temperature + * + * Call register usage: + * a0 Temperature Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Temperature Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READTEMP 32 +#define INTEL_SIP_SMC_HWMON_READTEMP \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READTEMP) + +/** + * Request INTEL_SIP_SMC_HWMON_READVOLT + * Sync call to request voltage + * + * Call register usage: + * a0 Voltage Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Voltage Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READVOLT 33 +#define INTEL_SIP_SMC_HWMON_READVOLT \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) + #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 60ed82112680..520004a5f15d 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -11,12 +11,14 @@ * * fpga: for FPGA 
configuration * rsu: for remote status update + * hwmon: for hardware monitoring (voltage and temperature) */ #define SVC_CLIENT_FPGA "fpga" #define SVC_CLIENT_RSU "rsu" #define SVC_CLIENT_FCS "fcs" +#define SVC_CLIENT_HWMON "hwmon" -/* +/** * Status of the sent command, in bit number * * SVC_STATUS_OK: @@ -70,6 +72,7 @@ #define SVC_RSU_REQUEST_TIMEOUT_MS 300 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 struct stratix10_svc_chan; @@ -171,6 +174,9 @@ enum stratix10_svc_command_code { COMMAND_MBOX_SEND_CMD = 100, /* Non-mailbox SMC Call */ COMMAND_SMC_SVC_VERSION = 200, + /* for HWMON */ + COMMAND_HWMON_READTEMP, + COMMAND_HWMON_READVOLT }; /** -- cgit v1.2.3 From bcb9f4f0706147afc62c48533276a18fe7b8f354 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:41 +0800 Subject: firmware: stratix10-svc: Add support for async communication Introduce support for asynchronous communication with the Stratix10 service channel. Define new structures to enable asynchronous messaging with the Secure Device Manager (SDM). Add and remove asynchronous support for existing channels. Implement initialization and cleanup routines for the asynchronous framework. Enable sending and polling of messages to the SDM asynchronously. The new public functions added are: - stratix10_svc_add_async_client: Adds a client to the service channel. - stratix10_svc_remove_async_client: Removes an asynchronous client from the service channel. - stratix10_svc_async_send: Sends an asynchronous message to the SDM mailbox in EL3 secure firmware. - stratix10_svc_async_poll: Polls the status of an asynchronous service request in EL3 secure firmware. - stratix10_svc_async_done: Marks an asynchronous transaction as complete and frees up the resources. These changes enhance the functionality of the Stratix10 service channel by allowing for more efficient and flexible communication with the firmware. Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 656 ++++++++++++++++++++- include/linux/firmware/intel/stratix10-smc.h | 25 + .../linux/firmware/intel/stratix10-svc-client.h | 88 +++ 3 files changed, 765 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index 9372a17d89b7..14bfa36a58ed 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -4,9 +4,12 @@ * Copyright (C) 2025, Altera Corporation */ +#include #include #include #include +#include +#include #include #include #include @@ -44,6 +47,49 @@ #define STRATIX10_RSU "stratix10-rsu" #define INTEL_FCS "intel-fcs" +/* Maximum number of SDM client IDs. */ +#define MAX_SDM_CLIENT_IDS 16 +/* Client ID for SIP Service Version 1. */ +#define SIP_SVC_V1_CLIENT_ID 0x1 +/* Maximum number of SDM job IDs. */ +#define MAX_SDM_JOB_IDS 16 +/* Number of bits used for asynchronous transaction hashing. */ +#define ASYNC_TRX_HASH_BITS 3 +/** + * Total number of transaction IDs, which is a combination of + * client ID and job ID. + */ +#define TOTAL_TRANSACTION_IDS \ + (MAX_SDM_CLIENT_IDS * MAX_SDM_JOB_IDS) + +/* Minimum major version of the ATF for Asynchronous transactions. 
*/ +#define ASYNC_ATF_MINIMUM_MAJOR_VERSION 0x3 +/* Minimum minor version of the ATF for Asynchronous transactions.*/ +#define ASYNC_ATF_MINIMUM_MINOR_VERSION 0x0 + +/* Job ID field in the transaction ID */ +#define STRATIX10_JOB_FIELD GENMASK(3, 0) +/* Client ID field in the transaction ID */ +#define STRATIX10_CLIENT_FIELD GENMASK(7, 4) +/* Transaction ID mask for Stratix10 service layer */ +#define STRATIX10_TRANS_ID_FIELD GENMASK(7, 0) + +/* Macro to extract the job ID from a transaction ID. */ +#define STRATIX10_GET_JOBID(transaction_id) \ + (FIELD_GET(STRATIX10_JOB_FIELD, transaction_id)) +/* Macro to set the job ID in a transaction ID. */ +#define STRATIX10_SET_JOBID(jobid) \ + (FIELD_PREP(STRATIX10_JOB_FIELD, jobid)) +/* Macro to set the client ID in a transaction ID. */ +#define STRATIX10_SET_CLIENTID(clientid) \ + (FIELD_PREP(STRATIX10_CLIENT_FIELD, clientid)) +/* Macro to set a transaction ID using a client ID and a job ID. */ +#define STRATIX10_SET_TRANSACTIONID(clientid, jobid) \ + (STRATIX10_SET_CLIENTID(clientid) | STRATIX10_SET_JOBID(jobid)) +/* Macro to set a transaction ID for SIP SMC Async transactions */ +#define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \ + (FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id)) + typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, @@ -64,7 +110,7 @@ struct stratix10_svc { * @sync_complete: state for a completion * @addr: physical address of shared memory block * @size: size of shared memory block - * @invoke_fn: function to issue secure monitor or hypervisor call + * @invoke_fn: service clients to handle secure monitor or hypervisor calls * * This struct is used to save physical address and size of shared memory * block. The shared memory blocked is allocated by secure monitor software @@ -122,6 +168,74 @@ struct stratix10_svc_data { u64 arg[3]; }; +/** + * struct stratix10_svc_async_handler - Asynchronous handler for Stratix10 + * service layer + * @transaction_id: Unique identifier for the transaction + * @achan: Pointer to the asynchronous channel structure + * @cb_arg: Argument to be passed to the callback function + * @cb: Callback function to be called upon completion + * @msg: Pointer to the client message structure + * @next: Node in the hash list + * @res: Response structure to store result from the secure firmware + * + * This structure is used to handle asynchronous transactions in the + * Stratix10 service layer. It maintains the necessary information + * for processing and completing asynchronous requests. 
+ */ + +struct stratix10_svc_async_handler { + u8 transaction_id; + struct stratix10_async_chan *achan; + void *cb_arg; + async_callback_t cb; + struct stratix10_svc_client_msg *msg; + struct hlist_node next; + struct arm_smccc_1_2_regs res; +}; + +/** + * struct stratix10_async_chan - Structure representing an asynchronous channel + * @async_client_id: Unique client identifier for the asynchronous operation + * @job_id_pool: Pointer to the job ID pool associated with this channel + */ + +struct stratix10_async_chan { + unsigned long async_client_id; + struct ida job_id_pool; +}; + +/** + * struct stratix10_async_ctrl - Control structure for Stratix10 + * asynchronous operations + * @initialized: Flag indicating whether the control structure has + * been initialized + * @invoke_fn: Function pointer for invoking Stratix10 service calls + * to EL3 secure firmware + * @async_id_pool: Pointer to the ID pool used for asynchronous + * operations + * @common_achan_refcount: Atomic reference count for the common + * asynchronous channel usage + * @common_async_chan: Pointer to the common asynchronous channel + * structure + * @trx_list_lock: Spinlock for protecting the transaction list + * operations + * @trx_list: Hash table for managing asynchronous transactions + */ + +struct stratix10_async_ctrl { + bool initialized; + void (*invoke_fn)(struct stratix10_async_ctrl *actrl, + const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res); + struct ida async_id_pool; + atomic_t common_achan_refcount; + struct stratix10_async_chan *common_async_chan; + /* spinlock to protect trx_list hash table */ + spinlock_t trx_list_lock; + DECLARE_HASHTABLE(trx_list, ASYNC_TRX_HASH_BITS); +}; + /** * struct stratix10_svc_controller - service controller * @dev: device @@ -135,6 +249,7 @@ struct stratix10_svc_data { * @complete_status: state for completion * @svc_fifo_lock: protect access to service message data queue * @invoke_fn: function to issue secure monitor call or hypervisor call + * @actrl: async control structure * * This struct is used to create communication channels for service clients, to * handle secure monitor or hypervisor call. @@ -151,6 +266,7 @@ struct stratix10_svc_controller { struct completion complete_status; spinlock_t svc_fifo_lock; svc_invoke_fn *invoke_fn; + struct stratix10_async_ctrl actrl; }; /** @@ -159,15 +275,17 @@ struct stratix10_svc_controller { * @scl: pointer to service client which owns the channel * @name: service client name associated with the channel * @lock: protect access to the channel + * @async_chan: reference to asynchronous channel object for this channel * - * This struct is used by service client to communicate with service layer, each - * service client has its own channel created by service controller. + * This struct is used by service client to communicate with service layer. + * Each service client has its own channel created by service controller. */ struct stratix10_svc_chan { struct stratix10_svc_controller *ctrl; struct stratix10_svc_client *scl; char *name; spinlock_t lock; + struct stratix10_async_chan *async_chan; }; static LIST_HEAD(svc_ctrl); @@ -942,6 +1060,525 @@ struct stratix10_svc_chan *stratix10_svc_request_channel_byname( } EXPORT_SYMBOL_GPL(stratix10_svc_request_channel_byname); +/** + * stratix10_svc_add_async_client - Add an asynchronous client to the + * Stratix10 service channel. + * @chan: Pointer to the Stratix10 service channel structure. 
+ * @use_unique_clientid: Boolean flag indicating whether to use a + * unique client ID. + * + * This function adds an asynchronous client to the specified + * Stratix10 service channel. If the `use_unique_clientid` flag is + * set to true, a unique client ID is allocated for the asynchronous + * channel. Otherwise, a common asynchronous channel is used. + * + * Return: 0 on success, or a negative error code on failure: + * -EINVAL if the channel is NULL or the async controller is + * not initialized. + * -EALREADY if the async channel is already allocated. + * -ENOMEM if memory allocation fails. + * Other negative values if ID allocation fails. + */ +int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, + bool use_unique_clientid) +{ + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + int ret = 0; + + if (!chan) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + + if (!actrl->initialized) { + dev_err(ctrl->dev, "Async controller not initialized\n"); + return -EINVAL; + } + + if (chan->async_chan) { + dev_err(ctrl->dev, "async channel already allocated\n"); + return -EALREADY; + } + + if (use_unique_clientid && + atomic_read(&actrl->common_achan_refcount) > 0) { + chan->async_chan = actrl->common_async_chan; + atomic_inc(&actrl->common_achan_refcount); + return 0; + } + + achan = kzalloc(sizeof(*achan), GFP_KERNEL); + if (!achan) + return -ENOMEM; + + ida_init(&achan->job_id_pool); + + ret = ida_alloc_max(&actrl->async_id_pool, MAX_SDM_CLIENT_IDS, + GFP_KERNEL); + if (ret < 0) { + dev_err(ctrl->dev, + "Failed to allocate async client id\n"); + ida_destroy(&achan->job_id_pool); + kfree(achan); + return ret; + } + + achan->async_client_id = ret; + chan->async_chan = achan; + + if (use_unique_clientid && + atomic_read(&actrl->common_achan_refcount) == 0) { + actrl->common_async_chan = achan; + atomic_inc(&actrl->common_achan_refcount); + } + + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_add_async_client); + +/** + * stratix10_svc_remove_async_client - Remove an asynchronous client + * from the Stratix10 service + * channel. + * @chan: Pointer to the Stratix10 service channel structure. + * + * This function removes an asynchronous client associated with the + * given service channel. It checks if the channel and the + * asynchronous channel are valid, and then proceeds to decrement + * the reference count for the common asynchronous channel if + * applicable. If the reference count reaches zero, it destroys the + * job ID pool and deallocates the asynchronous client ID. For + * non-common asynchronous channels, it directly destroys the job ID + * pool, deallocates the asynchronous client ID, and frees the + * memory allocated for the asynchronous channel. + * + * Return: 0 on success, -EINVAL if the channel or asynchronous + * channel is invalid. 
+ */ +int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan) +{ + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + + if (!chan) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + achan = chan->async_chan; + + if (!achan) { + dev_err(ctrl->dev, "async channel not allocated\n"); + return -EINVAL; + } + + if (achan == actrl->common_async_chan) { + atomic_dec(&actrl->common_achan_refcount); + if (atomic_read(&actrl->common_achan_refcount) == 0) { + ida_destroy(&achan->job_id_pool); + ida_free(&actrl->async_id_pool, + achan->async_client_id); + kfree(achan); + actrl->common_async_chan = NULL; + } + } else { + ida_destroy(&achan->job_id_pool); + ida_free(&actrl->async_id_pool, achan->async_client_id); + kfree(achan); + } + chan->async_chan = NULL; + + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_remove_async_client); + +/** + * stratix10_svc_async_send - Send an asynchronous message to the + * Stratix10 service + * @chan: Pointer to the service channel structure + * @msg: Pointer to the message to be sent + * @handler: Pointer to the handler for the asynchronous message + * used by caller for later reference. + * @cb: Callback function to be called upon completion + * @cb_arg: Argument to be passed to the callback function + * + * This function sends an asynchronous message to the SDM mailbox in + * EL3 secure firmware. It performs various checks and setups, + * including allocating a job ID, setting up the transaction ID and + * packaging it to El3 firmware. The function handles different + * commands by setting up the appropriate arguments for the SMC call. + * If the SMC call is successful, the handler is set up and the + * function returns 0. If the SMC call fails, appropriate error + * handling is performed along with cleanup of resources. + * + * Return: 0 on success, -EINVAL for invalid argument, -ENOMEM if + * memory is not available, -EAGAIN if EL3 firmware is busy, -EBADF + * if the message is rejected by EL3 firmware and -EIO on other + * errors from EL3 firmware. 
+ */ +int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, + void **handler, async_callback_t cb, void *cb_arg) +{ + struct arm_smccc_1_2_regs args = { 0 }, res = { 0 }; + struct stratix10_svc_async_handler *handle = NULL; + struct stratix10_svc_client_msg *p_msg = + (struct stratix10_svc_client_msg *)msg; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + int ret = 0; + + if (!chan || !msg || !handler) + return -EINVAL; + + achan = chan->async_chan; + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + + if (!actrl->initialized) { + dev_err(ctrl->dev, "Async controller not initialized\n"); + return -EINVAL; + } + + if (!achan) { + dev_err(ctrl->dev, "Async channel not allocated\n"); + return -EINVAL; + } + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + ret = ida_alloc_max(&achan->job_id_pool, MAX_SDM_JOB_IDS, + GFP_KERNEL); + if (ret < 0) { + dev_err(ctrl->dev, "Failed to allocate job id\n"); + kfree(handle); + return -ENOMEM; + } + + handle->transaction_id = + STRATIX10_SET_TRANSACTIONID(achan->async_client_id, ret); + handle->cb = cb; + handle->msg = p_msg; + handle->cb_arg = cb_arg; + handle->achan = achan; + + /*set the transaction jobid in args.a1*/ + args.a1 = + STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); + + switch (p_msg->command) { + default: + dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command); + ret = -EINVAL; + goto deallocate_id; + } + + /** + * There is a chance that during the execution of async_send() + * in one core, an interrupt might be received in another core; + * to mitigate this we are adding the handle to the DB and then + * send the smc call. If the smc call is rejected or busy then + * we will deallocate the handle for the client to retry again. + */ + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_add(actrl->trx_list, &handle->next, + handle->transaction_id); + } + + actrl->invoke_fn(actrl, &args, &res); + + switch (res.a0) { + case INTEL_SIP_SMC_STATUS_OK: + dev_dbg(ctrl->dev, + "Async message sent with transaction_id 0x%02x\n", + handle->transaction_id); + *handler = handle; + return 0; + case INTEL_SIP_SMC_STATUS_BUSY: + dev_warn(ctrl->dev, "Mailbox is busy, try after some time\n"); + ret = -EAGAIN; + break; + case INTEL_SIP_SMC_STATUS_REJECTED: + dev_err(ctrl->dev, "Async message rejected\n"); + ret = -EBADF; + break; + default: + dev_err(ctrl->dev, + "Failed to send async message ,got status as %ld\n", + res.a0); + ret = -EIO; + } + + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_del(&handle->next); + } + +deallocate_id: + ida_free(&achan->job_id_pool, + STRATIX10_GET_JOBID(handle->transaction_id)); + kfree(handle); + return ret; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_send); +/** + * stratix10_svc_async_poll - Polls the status of an asynchronous + * transaction. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being polled. + * @data: Pointer to the callback data structure. + * + * This function polls the status of an asynchronous transaction + * identified by the given transaction handle. It ensures that the + * necessary structures are initialized and valid before proceeding + * with the poll operation. The function sets up the necessary + * arguments for the SMC call, invokes the call, and prepares the + * response data if the call is successful. If the call fails, the + * function returns the error mapped to the SVC status error. 
+ * + * Return: 0 on success, -EINVAL if any input parameter is invalid, + * -EAGAIN if the transaction is still in progress, + * -EPERM if the command is invalid, or other negative + * error codes on failure. + */ +int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, + void *tx_handle, + struct stratix10_svc_cb_data *data) +{ + struct stratix10_svc_async_handler *handle; + struct arm_smccc_1_2_regs args = { 0 }; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + + if (!chan || !tx_handle || !data) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + achan = chan->async_chan; + + if (!achan) { + dev_err(ctrl->dev, "Async channel not allocated\n"); + return -EINVAL; + } + + handle = (struct stratix10_svc_async_handler *)tx_handle; + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + if (!hash_hashed(&handle->next)) { + dev_err(ctrl->dev, "Invalid transaction handler"); + return -EINVAL; + } + } + + args.a0 = INTEL_SIP_SMC_ASYNC_POLL; + args.a1 = + STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); + + actrl->invoke_fn(actrl, &args, &handle->res); + + /*clear data for response*/ + memset(data, 0, sizeof(*data)); + + if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) { + return 0; + } else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) { + dev_dbg(ctrl->dev, "async message is still in progress\n"); + return -EAGAIN; + } + + dev_err(ctrl->dev, + "Failed to poll async message ,got status as %ld\n", + handle->res.a0); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_poll); + +/** + * stratix10_svc_async_done - Completes an asynchronous transaction. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being completed. + * + * This function completes an asynchronous transaction identified by + * the given transaction handle. It ensures that the necessary + * structures are initialized and valid before proceeding with the + * completion operation. The function deallocates the transaction ID, + * frees the memory allocated for the handler, and removes the handler + * from the transaction list. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid, + * or other negative error codes on failure. + */ +int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle) +{ + struct stratix10_svc_async_handler *handle; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_chan *achan; + struct stratix10_async_ctrl *actrl; + + if (!chan || !tx_handle) + return -EINVAL; + + ctrl = chan->ctrl; + achan = chan->async_chan; + actrl = &ctrl->actrl; + + if (!achan) { + dev_err(ctrl->dev, "async channel not allocated\n"); + return -EINVAL; + } + + handle = (struct stratix10_svc_async_handler *)tx_handle; + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + if (!hash_hashed(&handle->next)) { + dev_err(ctrl->dev, "Invalid transaction handle"); + return -EINVAL; + } + hash_del(&handle->next); + } + ida_free(&achan->job_id_pool, + STRATIX10_GET_JOBID(handle->transaction_id)); + kfree(handle); + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_done); + +static inline void stratix10_smc_1_2(struct stratix10_async_ctrl *actrl, + const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res) +{ + arm_smccc_1_2_smc(args, res); +} + +/** + * stratix10_svc_async_init - Initialize the Stratix10 service + * controller for asynchronous operations. + * @controller: Pointer to the Stratix10 service controller structure. 
+ * + * This function initializes the asynchronous service controller by + * setting up the necessary data structures and initializing the + * transaction list. + * + * Return: 0 on success, -EINVAL if the controller is NULL or already + * initialized, -ENOMEM if memory allocation fails, + * -EADDRINUSE if the client ID is already reserved, or other + * negative error codes on failure. + */ +static int stratix10_svc_async_init(struct stratix10_svc_controller *controller) +{ + struct stratix10_async_ctrl *actrl; + struct arm_smccc_res res; + struct device *dev; + int ret; + + if (!controller) + return -EINVAL; + + actrl = &controller->actrl; + + if (actrl->initialized) + return -EINVAL; + + dev = controller->dev; + + controller->invoke_fn(INTEL_SIP_SMC_SVC_VERSION, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != INTEL_SIP_SMC_STATUS_OK || + !(res.a1 > ASYNC_ATF_MINIMUM_MAJOR_VERSION || + (res.a1 == ASYNC_ATF_MINIMUM_MAJOR_VERSION && + res.a2 >= ASYNC_ATF_MINIMUM_MINOR_VERSION))) { + dev_err(dev, + "Intel Service Layer Driver: ATF version is not compatible for async operation\n"); + return -EINVAL; + } + + actrl->invoke_fn = stratix10_smc_1_2; + + ida_init(&actrl->async_id_pool); + + /** + * SIP_SVC_V1_CLIENT_ID is used by V1/stratix10_svc_send() clients + * for communicating with SDM synchronously. We need to restrict + * this in V3/stratix10_svc_async_send() usage to distinguish + * between V1 and V3 messages in El3 firmware. + */ + ret = ida_alloc_range(&actrl->async_id_pool, SIP_SVC_V1_CLIENT_ID, + SIP_SVC_V1_CLIENT_ID, GFP_KERNEL); + if (ret < 0) { + dev_err(dev, + "Intel Service Layer Driver: Error on reserving SIP_SVC_V1_CLIENT_ID\n"); + ida_destroy(&actrl->async_id_pool); + actrl->invoke_fn = NULL; + return -EADDRINUSE; + } + + spin_lock_init(&actrl->trx_list_lock); + hash_init(actrl->trx_list); + atomic_set(&actrl->common_achan_refcount, 0); + + actrl->initialized = true; + return 0; +} + +/** + * stratix10_svc_async_exit - Clean up and exit the asynchronous + * service controller + * @ctrl: Pointer to the stratix10_svc_controller structure + * + * This function performs the necessary cleanup for the asynchronous + * service controller. It checks if the controller is valid and if it + * has been initialized. It then locks the transaction list and safely + * removes and deallocates each handler in the list. The function also + * removes any asynchronous clients associated with the controller's + * channels and destroys the asynchronous ID pool. Finally, it resets + * the asynchronous ID pool and invoke function pointers to NULL. + * + * Return: 0 on success, -EINVAL if the controller is invalid or not + * initialized. 
+ */ +static int stratix10_svc_async_exit(struct stratix10_svc_controller *ctrl) +{ + struct stratix10_svc_async_handler *handler; + struct stratix10_async_ctrl *actrl; + struct hlist_node *tmp; + int i; + + if (!ctrl) + return -EINVAL; + + actrl = &ctrl->actrl; + + if (!actrl->initialized) + return -EINVAL; + + actrl->initialized = false; + + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_for_each_safe(actrl->trx_list, i, tmp, handler, next) { + ida_free(&handler->achan->job_id_pool, + STRATIX10_GET_JOBID(handler->transaction_id)); + hash_del(&handler->next); + kfree(handler); + } + } + + for (i = 0; i < SVC_NUM_CHANNEL; i++) { + if (ctrl->chans[i].async_chan) { + stratix10_svc_remove_async_client(&ctrl->chans[i]); + ctrl->chans[i].async_chan = NULL; + } + } + + ida_destroy(&actrl->async_id_pool); + actrl->invoke_fn = NULL; + + return 0; +} + /** * stratix10_svc_free_channel() - free service channel * @chan: service channel to be freed @@ -1197,11 +1834,18 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) controller->invoke_fn = invoke_fn; init_completion(&controller->complete_status); + ret = stratix10_svc_async_init(controller); + if (ret) { + dev_dbg(dev, "Intel Service Layer Driver: Error on stratix10_svc_async_init %d\n", + ret); + goto err_destroy_pool; + } + fifo_size = sizeof(struct stratix10_svc_data) * SVC_NUM_DATA_IN_FIFO; ret = kfifo_alloc(&controller->svc_fifo, fifo_size, GFP_KERNEL); if (ret) { dev_err(dev, "failed to allocate FIFO\n"); - goto err_destroy_pool; + goto err_async_exit; } spin_lock_init(&controller->svc_fifo_lock); @@ -1277,6 +1921,8 @@ err_unregister_rsu_dev: platform_device_unregister(svc->stratix10_svc_rsu); err_free_kfifo: kfifo_free(&controller->svc_fifo); +err_async_exit: + stratix10_svc_async_exit(controller); err_destroy_pool: gen_pool_destroy(genpool); return ret; @@ -1287,6 +1933,8 @@ static void stratix10_svc_drv_remove(struct platform_device *pdev) struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev); struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev); + stratix10_svc_async_exit(ctrl); + of_platform_depopulate(ctrl->dev); platform_device_unregister(svc->intel_svc_fcs); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 7306dd243b2a..3995d5d70cce 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SMC_H @@ -47,6 +48,10 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ ARM_SMCCC_OWNER_SIP, (func_num)) +#define INTEL_SIP_SMC_ASYNC_VAL(func_name) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, (func_name)) + /** * Return values in INTEL_SIP_SMC_* call * @@ -654,4 +659,24 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_HWMON_READVOLT \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) +/** + * Request INTEL_SIP_SMC_ASYNC_POLL + * Async call used by service driver at EL1 to query mailbox response from SDM. 
+ * + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_POLL + * a1 transaction job id + * a2-17 will be used to return the response data + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1-17 will contain the response values from mailbox for the previous send + * transaction + * Or + * a0 INTEL_SIP_SMC_STATUS_NO_RESPONSE + * a1-17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) +#define INTEL_SIP_SMC_ASYNC_POLL \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 520004a5f15d..532dd4bd76dd 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SVC_CLIENT_H @@ -290,5 +291,92 @@ int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg); * request process. */ void stratix10_svc_done(struct stratix10_svc_chan *chan); + +/** + * typedef async_callback_t - A type definition for an asynchronous callback function. + * + * This type defines a function pointer for an asynchronous callback. + * The callback function takes a single argument, which is a pointer to + * user-defined data. + * + * @param cb_arg A pointer to user-defined data passed to the callback function. + */ +typedef void (*async_callback_t)(void *cb_arg); + +/** + * stratix10_svc_add_async_client - Add an asynchronous client to a Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * @use_unique_clientid: Boolean flag indicating whether to use a unique client ID. + * + * This function registers an asynchronous client with the specified Stratix 10 + * service channel. If the use_unique_clientid flag is set to true, a unique client + * ID will be assigned to the client. + * + * Return: 0 on success, or a negative error code on failure: + * -EINVAL if the channel is NULL or the async controller is not initialized. + * -EALREADY if the async channel is already allocated. + * -ENOMEM if memory allocation fails. + * Other negative values if ID allocation fails + */ +int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, bool use_unique_clientid); + +/** + * stratix10_svc_remove_async_client - Remove an asynchronous client from the Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * + * This function removes an asynchronous client from the specified Stratix 10 service channel. + * It is typically used to clean up and release resources associated with the client. + * + * Return: 0 on success, -EINVAL if the channel or asynchronous channel is invalid. + */ +int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan); + +/** + * stratix10_svc_async_send - Send an asynchronous message to the SDM mailbox + * in EL3 secure firmware. + * @chan: Pointer to the service channel structure. + * @msg: Pointer to the message to be sent. + * @handler: Pointer to the handler object used by caller to track the transaction. + * @cb: Callback function to be called upon completion. + * @cb_arg: Argument to be passed to the callback function. + * + * This function sends a message asynchronously to the SDM mailbox in EL3 secure firmware. + * and registers a callback function to be invoked when the operation completes. 
+ * + * Return: 0 on success,and negative error codes on failure. + */ +int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, void **handler, + async_callback_t cb, void *cb_arg); + +/** + * stratix10_svc_async_poll - Polls the status of an asynchronous service request. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being polled. + * @data: Pointer to the callback data structure to be filled with the result. + * + * This function checks the status of an asynchronous service request + * and fills the provided callback data structure with the result. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid or if the + * async controller is not initialized, -EAGAIN if the transaction is + * still in progress, or other negative error codes on failure. + */ +int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, void *tx_handle, + struct stratix10_svc_cb_data *data); + +/** + * stratix10_svc_async_done - Complete an asynchronous transaction + * @chan: Pointer to the service channel structure + * @tx_handle: Pointer to the transaction handle + * + * This function completes an asynchronous transaction by removing the + * transaction from the hash table and deallocating the associated resources. + * + * Return: 0 on success, -EINVAL on invalid input or errors. + */ +int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle); + #endif -- cgit v1.2.3 From ec52379341a1209826c3e0ae53674393724d2071 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:42 +0800 Subject: firmware: stratix10-svc: Add support for RSU commands in asynchronous framework Integrate Remote System Update(RSU) service commands into the asynchronous framework for communicating with SDM. This allows the RSU commands to be processed asynchronously, improving the responsiveness of the Stratix10 service channel. 
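As a hedged sketch (not part of this patch), a client could drive one of these commands through the asynchronous API introduced earlier in the series roughly as follows; rsu_notify_async() and its callback are hypothetical names, everything else is declared in stratix10-svc-client.h:

    #include <linux/delay.h>
    #include <linux/firmware/intel/stratix10-svc-client.h>

    static void rsu_async_cb(void *cb_arg)
    {
            /* completion notification; not used by this polled sketch */
    }

    static int rsu_notify_async(struct stratix10_svc_chan *chan, u64 value)
    {
            struct stratix10_svc_client_msg msg = {
                    .command = COMMAND_RSU_NOTIFY,
                    .arg = { value },
            };
            struct stratix10_svc_cb_data cb_data;
            void *handle;
            int ret;

            /* false: share the common asynchronous client ID */
            ret = stratix10_svc_add_async_client(chan, false);
            if (ret && ret != -EALREADY)
                    return ret;

            ret = stratix10_svc_async_send(chan, &msg, &handle, rsu_async_cb, NULL);
            if (ret)
                    return ret;

            /* -EAGAIN means the SDM has not answered yet */
            do {
                    usleep_range(1000, 2000);
                    ret = stratix10_svc_async_poll(chan, handle, &cb_data);
            } while (ret == -EAGAIN);

            stratix10_svc_async_done(chan, handle);

            return ret ? ret : (cb_data.status ? -EIO : 0);
    }

A production caller would bound the poll loop and translate the SDM status code properly instead of collapsing any non-zero value to -EIO.
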
The asynchronous framework now supports the following RSU commands: * COMMAND_RSU_GET_SPT_TABLE * COMMAND_RSU_STATUS * COMMAND_RSU_NOTIFY Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 72 ++++++++++++++++++++++ include/linux/firmware/intel/stratix10-smc.h | 52 ++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 4 ++ 3 files changed, 128 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index 14bfa36a58ed..3acfa067c5dd 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -90,6 +90,12 @@ #define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \ (FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id)) +/* 10-bit mask for extracting the SDM status code */ +#define STRATIX10_SDM_STATUS_MASK GENMASK(9, 0) +/* Macro to get the SDM mailbox error status */ +#define STRATIX10_GET_SDM_STATUS_CODE(status) \ + (FIELD_GET(STRATIX10_SDM_STATUS_MASK, status)) + typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, @@ -1273,6 +1279,16 @@ int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); switch (p_msg->command) { + case COMMAND_RSU_GET_SPT_TABLE: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_SPT; + break; + case COMMAND_RSU_STATUS: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS; + break; + case COMMAND_RSU_NOTIFY: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_NOTIFY; + args.a2 = p_msg->arg[0]; + break; default: dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command); ret = -EINVAL; @@ -1326,6 +1342,56 @@ deallocate_id: return ret; } EXPORT_SYMBOL_GPL(stratix10_svc_async_send); + +/** + * stratix10_svc_async_prepare_response - Prepare the response data for + * an asynchronous transaction. + * @chan: Pointer to the service channel structure. + * @handle: Pointer to the asynchronous handler structure. + * @data: Pointer to the callback data structure. + * + * This function prepares the response data for an asynchronous transaction. It + * extracts the response data from the SMC response structure and stores it in + * the callback data structure. The function also logs the completion of the + * asynchronous transaction. + * + * Return: 0 on success, -ENOENT if the command is invalid + */ +static int stratix10_svc_async_prepare_response(struct stratix10_svc_chan *chan, + struct stratix10_svc_async_handler *handle, + struct stratix10_svc_cb_data *data) +{ + struct stratix10_svc_client_msg *p_msg = + (struct stratix10_svc_client_msg *)handle->msg; + struct stratix10_svc_controller *ctrl = chan->ctrl; + + data->status = STRATIX10_GET_SDM_STATUS_CODE(handle->res.a1); + + switch (p_msg->command) { + case COMMAND_RSU_NOTIFY: + break; + case COMMAND_RSU_GET_SPT_TABLE: + data->kaddr1 = (void *)&handle->res.a2; + data->kaddr2 = (void *)&handle->res.a3; + break; + case COMMAND_RSU_STATUS: + /* COMMAND_RSU_STATUS has more elements than the cb_data + * can acomodate, so passing the response structure to the + * response function to be handled before done command is + * executed by the client. 
+ */ + data->kaddr1 = (void *)&handle->res; + break; + + default: + dev_alert(ctrl->dev, "Invalid command\n ,%d", p_msg->command); + return -ENOENT; + } + dev_dbg(ctrl->dev, "Async message completed transaction_id 0x%02x\n", + handle->transaction_id); + return 0; +} + /** * stratix10_svc_async_poll - Polls the status of an asynchronous * transaction. @@ -1355,6 +1421,7 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, struct stratix10_svc_controller *ctrl; struct stratix10_async_ctrl *actrl; struct stratix10_async_chan *achan; + int ret; if (!chan || !tx_handle || !data) return -EINVAL; @@ -1386,6 +1453,11 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, memset(data, 0, sizeof(*data)); if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) { + ret = stratix10_svc_async_prepare_response(chan, handle, data); + if (ret) { + dev_err(ctrl->dev, "Error in preparation of response,%d\n", ret); + WARN_ON_ONCE(1); + } return 0; } else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) { dev_dbg(ctrl->dev, "async message is still in progress\n"); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 3995d5d70cce..935dba3633b5 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -679,4 +679,56 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) #define INTEL_SIP_SMC_ASYNC_POLL \ INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * Async call to get RSU SPT from SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT (0xEA) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_SPT \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * Async call to get RSU error status from SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS (0xEB) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * Async call to send NOTIFY value to SDM. 
+ * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * a1 transaction job id + * a2 notify value + * a3-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY (0xEC) +#define INTEL_SIP_SMC_ASYNC_RSU_NOTIFY \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 532dd4bd76dd..1bcc56d14080 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -128,6 +128,9 @@ struct stratix10_svc_chan; * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status * return status is SVC_STATUS_OK or SVC_STATUS_ERROR * + * @COMMAND_RSU_GET_SPT_TABLE: query firmware for SPT table + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR + * * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware, * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM * @@ -162,6 +165,7 @@ enum stratix10_svc_command_code { COMMAND_RSU_DCMF_VERSION, COMMAND_RSU_DCMF_STATUS, COMMAND_FIRMWARE_VERSION, + COMMAND_RSU_GET_SPT_TABLE, /* for FCS */ COMMAND_FCS_REQUEST_SERVICE = 20, COMMAND_FCS_SEND_CERTIFICATE, -- cgit v1.2.3 From 524c3853831cf4f7e1db579e487c757c3065165c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 22 Oct 2025 20:11:37 +0900 Subject: jbd2: use a per-journal lock_class_key for jbd2_trans_commit_key syzbot is reporting possibility of deadlock due to sharing lock_class_key for jbd2_handle across ext4 and ocfs2. But this is a false positive, for one disk partition can't have two filesystems at the same time. Reported-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6e493c165d26d6fcbf72 Signed-off-by: Tetsuo Handa Tested-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Reviewed-by: Jan Kara Message-ID: <987110fc-5470-457a-a218-d286a09dd82f@I-love.SAKURA.ne.jp> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/journal.c | 6 ++++-- include/linux/jbd2.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d480b94117cd..f43474002f50 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1521,7 +1521,6 @@ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { - static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; int n; @@ -1530,6 +1529,7 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal) return ERR_PTR(-ENOMEM); + lockdep_register_key(&journal->jbd2_trans_commit_key); journal->j_blocksize = blocksize; journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -1560,7 +1560,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); + &journal->jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! 
*/ journal->j_flags = JBD2_ABORT; @@ -1611,6 +1611,7 @@ err_cleanup: kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); + lockdep_unregister_key(&journal->jbd2_trans_commit_key); kfree(journal); return ERR_PTR(err); } @@ -2187,6 +2188,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); + lockdep_unregister_key(&journal->jbd2_trans_commit_key); kfree(journal); return err; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 43b9297fe8a7..f5eaf76198f3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1253,6 +1253,12 @@ struct journal_s */ struct lockdep_map j_trans_commit_map; #endif + /** + * @jbd2_trans_commit_key: + * + * "struct lock_class_key" for @j_trans_commit_map + */ + struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: -- cgit v1.2.3 From ebd4469e7af61019daaf904fdcba07a9ecd18440 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 5 Nov 2025 14:40:32 +1100 Subject: entry: Fix ifndef around arch_xfer_to_guest_mode_handle_work() stub The stub implementation of arch_xfer_to_guest_mode_handle_work() is guarded by an #ifndef that incorrectly checks for the name arch_xfer_to_guest_mode_work instead. It seems the function was renamed to add "_handle" as a late change to the original patch, and the #ifndef wasn't updated to go with it. Change the #ifndef to match the name of the function. No users right now, so no need to update any architecture code. Fixes: 935ace2fb5cc4 ("entry: Provide infrastructure for work before transitioning to guest mode") Signed-off-by: Andrew Donnellan Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251105-entry-fix-ifndef-v1-1-d8d28045b627@linux.ibm.com --- include/linux/entry-virt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h index 42c89e3e5ca7..bfa767702d9a 100644 --- a/include/linux/entry-virt.h +++ b/include/linux/entry-virt.h @@ -32,7 +32,7 @@ */ static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); -#ifndef arch_xfer_to_guest_mode_work +#ifndef arch_xfer_to_guest_mode_handle_work static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) { return 0; -- cgit v1.2.3 From f694d215d34035cc64b1d176fd82db0d1f2428d4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 11 Nov 2025 11:26:44 +0000 Subject: net: stmmac: always allocate mac_device_info The ->setup() method implemented by dwmac-loongson and dwmac-sun8i allocate the mac_device_info structure, as does stmmac_hwif_init(). This makes no sense. Have stmmac_hwif_init() always allocate this structure, and pass it to the ->setup() method to initialise when it is provided. Rename this method to "mac_setup" to more accurately describe what it is doing. 
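A hedged sketch of what a converted glue driver now provides (my_dwmac_setup is a hypothetical name; the register fields mirror the Loongson code in the diff below):

    static int my_dwmac_setup(void *apriv, struct mac_device_info *mac)
    {
            struct stmmac_priv *priv = apriv;

            /* 'mac' is pre-allocated by stmmac_hwif_init(); only initialise it */
            mac->pcsr = priv->ioaddr;
            mac->mii.clk_csr_shift = 2;
            mac->mii.clk_csr_mask = GENMASK(5, 2);

            return 0;
    }

            /* in the glue driver's probe: */
            plat_dat->mac_setup = my_dwmac_setup;
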
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vImWK-0000000DrIx-28vO@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 13 ++++--------- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 11 +++-------- drivers/net/ethernet/stmicro/stmmac/hwif.c | 16 +++++++++------- include/linux/stmmac.h | 4 +++- 4 files changed, 19 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 2a3ac0136cdb..dd2fc39ec3e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -320,10 +320,9 @@ static int loongson_dwmac_dma_interrupt(struct stmmac_priv *priv, return ret; } -static struct mac_device_info *loongson_dwmac_setup(void *apriv) +static int loongson_dwmac_setup(void *apriv, struct mac_device_info *mac) { struct stmmac_priv *priv = apriv; - struct mac_device_info *mac; struct stmmac_dma_ops *dma; struct loongson_data *ld; struct pci_dev *pdev; @@ -331,13 +330,9 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv) ld = priv->plat->bsp_priv; pdev = to_pci_dev(priv->device); - mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); - if (!mac) - return NULL; - dma = devm_kzalloc(priv->device, sizeof(*dma), GFP_KERNEL); if (!dma) - return NULL; + return -ENOMEM; /* The Loongson GMAC and GNET devices are based on the DW GMAC * v3.50a and v3.73a IP-cores. But the HW designers have changed @@ -396,7 +391,7 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv) mac->mii.clk_csr_shift = 2; mac->mii.clk_csr_mask = GENMASK(5, 2); - return mac; + return 0; } static int loongson_dwmac_msi_config(struct pci_dev *pdev, @@ -598,7 +593,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id goto err_disable_device; plat->bsp_priv = ld; - plat->setup = loongson_dwmac_setup; + plat->mac_setup = loongson_dwmac_setup; plat->fix_soc_reset = loongson_dwmac_fix_reset; plat->suspend = loongson_dwmac_suspend; plat->resume = loongson_dwmac_resume; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 5d871b2cd111..7434d4bbb526 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1040,15 +1040,10 @@ static const struct stmmac_ops sun8i_dwmac_ops = { .set_mac_loopback = sun8i_dwmac_set_mac_loopback, }; -static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) +static int sun8i_dwmac_setup(void *ppriv, struct mac_device_info *mac) { - struct mac_device_info *mac; struct stmmac_priv *priv = ppriv; - mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); - if (!mac) - return NULL; - mac->pcsr = priv->ioaddr; mac->mac = &sun8i_dwmac_ops; mac->dma = &sun8i_dwmac_dma_ops; @@ -1079,7 +1074,7 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) /* Synopsys Id is not available */ priv->synopsys_id = 0; - return mac; + return 0; } static struct regmap *sun8i_dwmac_get_syscon_from_dev(struct device_node *node) @@ -1192,7 +1187,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = gmac; plat_dat->init = sun8i_dwmac_init; plat_dat->exit = sun8i_dwmac_exit; - plat_dat->setup = sun8i_dwmac_setup; + plat_dat->mac_setup = sun8i_dwmac_setup; plat_dat->tx_fifo_size = 4096; plat_dat->rx_fifo_size = 16384; diff --git 
a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index ee612cadbd77..014f7cd79a3c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -347,17 +347,19 @@ int stmmac_hwif_init(struct stmmac_priv *priv) priv->estaddr = priv->ioaddr + EST_XGMAC_OFFSET; } + mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); + if (!mac) + return -ENOMEM; + /* Check for HW specific setup first */ - if (priv->plat->setup) { - mac = priv->plat->setup(priv); + if (priv->plat->mac_setup) { + ret = priv->plat->mac_setup(priv, mac); + if (ret) + return ret; + needs_setup = false; - } else { - mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); } - if (!mac) - return -ENOMEM; - spin_lock_init(&mac->irq_ctrl_lock); /* Fallback to generic HW */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 48e9f1d4e17e..4f70a6551e68 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -192,6 +192,8 @@ enum dwmac_core_type { #define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP BIT(12) #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) +struct mac_device_info; + struct plat_stmmacenet_data { enum dwmac_core_type core_type; int bus_id; @@ -266,7 +268,7 @@ struct plat_stmmacenet_data { void (*exit)(struct platform_device *pdev, void *priv); int (*suspend)(struct device *dev, void *priv); int (*resume)(struct device *dev, void *priv); - struct mac_device_info *(*setup)(void *priv); + int (*mac_setup)(void *priv, struct mac_device_info *mac); int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, void *ctx); -- cgit v1.2.3 From 0f2620ffc41d117cc28bc053efe2dc837cf748dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:42 +0100 Subject: fault-inject: make enum fault_flags available unconditionally This will allow using should_fail_ex from code without having to make it conditional on CONFIG_FAULT_INJECTION. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-2-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/fault-inject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 8c829d28dcf3..58fd14c82270 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -8,6 +8,10 @@ struct dentry; struct kmem_cache; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #ifdef CONFIG_FAULT_INJECTION #include @@ -36,10 +40,6 @@ struct fault_attr { struct dentry *dname; }; -enum fault_flags { - FAULT_NOWARN = 1 << 0, -}; - #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ -- cgit v1.2.3 From 36640d21fdfe0152c96e6cb9b58e3336291dfbaa Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 29 Oct 2025 13:34:49 +0530 Subject: PCI: Export pci_get_host_bridge_device() for use by pci-keystone The pci-keystone.c driver uses the 'pci_get_host_bridge_device()' helper. Export it in preparation for enabling the pci-keystone.c driver to be built as a loadable module. 
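A minimal sketch of the pattern this export enables in a loadable module; ks_example_use_bridge() is a hypothetical name, and the get/put pairing follows the pci_put_host_bridge_device() counterpart visible in the diff below:

    #include <linux/pci.h>

    static void ks_example_use_bridge(struct pci_dev *dev)
    {
            struct device *bridge = pci_get_host_bridge_device(dev);

            /* ... look up glue-driver data hanging off the host bridge ... */

            pci_put_host_bridge_device(bridge);
    }
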
Signed-off-by: Siddharth Vadapalli Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251029080547.1253757-2-s-vadapalli@ti.com --- drivers/pci/host-bridge.c | 1 + include/linux/pci.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index afa50b446567..be5ef6516cff 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev) kobject_get(&bridge->kobj); return bridge; } +EXPORT_SYMBOL_GPL(pci_get_host_bridge_device); void pci_put_host_bridge_device(struct device *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..b253cbc27d36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -646,6 +646,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv); struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv); void pci_free_host_bridge(struct pci_host_bridge *bridge); +struct device *pci_get_host_bridge_device(struct pci_dev *dev); struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); void pci_set_host_bridge_release(struct pci_host_bridge *bridge, -- cgit v1.2.3 From 8d63e85c5b50f1dbfa0ccb214bd91fe5d7e2e860 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2025 11:26:53 -0800 Subject: firmware: cs_dsp: fix kernel-doc warnings in a header file Use correct kernel-doc format to avoid kernel-doc warnings in nclude/linux/firmware/cirrus/cs_dsp_test_utils.h: - mark one struct member as private: since the comment says that it is private - add ending ':' to struct members where needed Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:30 struct member 'saw_bus_write' not described in 'cs_dsp_test' Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'id' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ver' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'xm_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'xm_size_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ym_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ym_size_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'zm_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'zm_size_words' not described in 'cs_dsp_mock_alg_def' Signed-off-by: Randy Dunlap Reviewed-by: Richard Fitzgerald Link: https://patch.msgid.link/20251104192653.929157-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index ecd821ed8064..1f97764fdfd7 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -26,21 +26,21 @@ struct cs_dsp_test { struct cs_dsp_test_local *local; - /* 
Following members are private */ + /* private: Following members are private */ bool saw_bus_write; }; /** * struct cs_dsp_mock_alg_def - Info for creating a mock algorithm entry. * - * @id Algorithm ID. - * @ver; Algorithm version. - * @xm_base_words XM base address in DSP words. - * @xm_size_words XM size in DSP words. - * @ym_base_words YM base address in DSP words. - * @ym_size_words YM size in DSP words. - * @zm_base_words ZM base address in DSP words. - * @zm_size_words ZM size in DSP words. + * @id: Algorithm ID. + * @ver: Algorithm version. + * @xm_base_words: XM base address in DSP words. + * @xm_size_words: XM size in DSP words. + * @ym_base_words: YM base address in DSP words. + * @ym_size_words: YM size in DSP words. + * @zm_base_words: ZM base address in DSP words. + * @zm_size_words: ZM size in DSP words. */ struct cs_dsp_mock_alg_def { unsigned int id; -- cgit v1.2.3 From e5b5f8b7c26f72fe86b59979e51d8e6cf36ea903 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:40 -0800 Subject: PCI/TSM: Drop stub for pci_tsm_doe_transfer() Just like pci_tsm_pf0_{con,de}structor(), in the CONFIG_PCI_TSM=n case there should be no callers of pci_tsm_doe_transfer(). Reported-by: Xu Yilun Closes: http://lore.kernel.org/aRFfk14DJWEVhC/R@yilunxu-OptiPlex-7050 Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-tsm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index e921d30f9b6c..d7b078d5e272 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -147,11 +147,5 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } -static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, - const void *req, size_t req_sz, - void *resp, size_t resp_sz) -{ - return -ENXIO; -} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c16af019d9d6d23f211c82b5561f2ecd2a7dff54 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:41 -0800 Subject: resource: Introduce resource_assigned() for discerning active resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PCI bridge resource lifecycle involves both a "request" and "assign" phase. At any point in time that resource may not yet be assigned, or may have failed to assign (because it does not fit). There are multiple conventions to determine when assignment has not completed: IORESOURCE_UNSET, IORESOURCE_DISABLED, and checking whether the resource is parented. In code paths that are known to not be racing assignment, e.g. post subsys_initcall(), the most reliable method to judge that a bridge resource is assigned is to check the resource is parented [1]. Introduce a resource_assigned() helper for this purpose. 
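As a rough usage sketch (hypothetical caller; only resource_assigned() itself comes from this patch), code that is known not to race assignment could gate access to a bridge window like this:

#include <linux/ioport.h>
#include <linux/pci.h>

/* Hypothetical illustration: skip bridge windows that were requested but
 * never assigned (i.e. still unparented). Only safe in contexts that do
 * not race resource assignment, as the helper's comment notes.
 */
static bool example_bridge_window_usable(struct pci_dev *bridge, int idx)
{
        struct resource *res = &bridge->resource[idx];

        if (!resource_assigned(res))
                return false;

        return resource_size(res) > 0;
}
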
Link: http://lore.kernel.org/2b9f7f7b-d6a4-be59-14d4-7b4ffccfe373@linux.intel.com [1] Suggested-by: Ilpo Järvinen Cc: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/ioport.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index e8b2d6aa4013..9afa30f9346f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -334,6 +334,15 @@ static inline bool resource_union(const struct resource *r1, const struct resour return true; } +/* + * Check if this resource is added to a resource tree or detached. Caller is + * responsible for not racing assignment. + */ +static inline bool resource_assigned(struct resource *res) +{ + return res->parent; +} + int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint); -- cgit v1.2.3 From a97fbc3ee3e2a536fafaff04f21f45472db71769 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 29 Oct 2025 17:33:30 +0100 Subject: syscore: Pass context data to callbacks Several drivers can benefit from registering per-instance data along with the syscore operations. To achieve this, move the modifiable fields out of the syscore_ops structure and into a separate struct syscore that can be registered with the framework. Add a void * driver data field for drivers to store contextual data that will be passed to the syscore ops. Acked-by: Rafael J. Wysocki (Intel) Signed-off-by: Thierry Reding --- arch/arm/mach-exynos/mcpm-exynos.c | 12 +++-- arch/arm/mach-exynos/suspend.c | 48 +++++++++++------- arch/arm/mach-pxa/generic.h | 6 +-- arch/arm/mach-pxa/irq.c | 10 ++-- arch/arm/mach-pxa/mfp-pxa2xx.c | 10 ++-- arch/arm/mach-pxa/mfp-pxa3xx.c | 10 ++-- arch/arm/mach-pxa/pxa25x.c | 4 +- arch/arm/mach-pxa/pxa27x.c | 4 +- arch/arm/mach-pxa/pxa3xx.c | 4 +- arch/arm/mach-pxa/smemc.c | 12 +++-- arch/arm/mach-s3c/irq-pm-s3c64xx.c | 12 +++-- arch/arm/mach-s5pv210/pm.c | 10 ++-- arch/arm/mach-versatile/integrator_ap.c | 12 +++-- arch/arm/mm/cache-b15-rac.c | 12 +++-- arch/loongarch/kernel/smp.c | 12 +++-- arch/mips/alchemy/common/dbdma.c | 12 +++-- arch/mips/alchemy/common/irq.c | 24 ++++++--- arch/mips/alchemy/common/usb.c | 12 +++-- arch/mips/pci/pci-alchemy.c | 16 +++--- arch/powerpc/platforms/cell/spu_base.c | 10 ++-- arch/powerpc/platforms/powermac/pic.c | 12 +++-- arch/powerpc/sysdev/fsl_lbc.c | 12 +++-- arch/powerpc/sysdev/fsl_pci.c | 12 +++-- arch/powerpc/sysdev/ipic.c | 12 +++-- arch/powerpc/sysdev/mpic.c | 14 ++++-- arch/powerpc/sysdev/mpic_timer.c | 10 ++-- arch/sh/mm/pmb.c | 10 ++-- arch/x86/events/amd/ibs.c | 12 +++-- arch/x86/hyperv/hv_init.c | 12 +++-- arch/x86/kernel/amd_gart_64.c | 10 ++-- arch/x86/kernel/apic/apic.c | 12 +++-- arch/x86/kernel/apic/io_apic.c | 17 +++++-- arch/x86/kernel/cpu/aperfmperf.c | 20 +++++--- arch/x86/kernel/cpu/intel_epb.c | 16 +++--- arch/x86/kernel/cpu/mce/core.c | 14 ++++-- arch/x86/kernel/cpu/microcode/core.c | 15 ++++-- arch/x86/kernel/cpu/mtrr/legacy.c | 12 +++-- arch/x86/kernel/cpu/umwait.c | 10 ++-- arch/x86/kernel/i8237.c | 10 ++-- arch/x86/kernel/i8259.c | 14 ++++-- arch/x86/kernel/kvm.c | 12 +++-- drivers/acpi/pci_link.c | 10 ++-- drivers/acpi/sleep.c | 12 +++-- drivers/base/firmware_loader/main.c | 12 +++-- drivers/base/syscore.c | 82 ++++++++++++++++--------------- drivers/bus/mvebu-mbus.c | 16 +++--- drivers/clk/at91/pmc.c | 12 +++-- 
drivers/clk/imx/clk-vf610.c | 12 +++-- drivers/clk/ingenic/jz4725b-cgu.c | 2 +- drivers/clk/ingenic/jz4740-cgu.c | 2 +- drivers/clk/ingenic/jz4755-cgu.c | 2 +- drivers/clk/ingenic/jz4760-cgu.c | 2 +- drivers/clk/ingenic/jz4770-cgu.c | 2 +- drivers/clk/ingenic/jz4780-cgu.c | 2 +- drivers/clk/ingenic/pm.c | 14 ++++-- drivers/clk/ingenic/pm.h | 2 +- drivers/clk/ingenic/tcu.c | 12 +++-- drivers/clk/ingenic/x1000-cgu.c | 2 +- drivers/clk/ingenic/x1830-cgu.c | 2 +- drivers/clk/mvebu/common.c | 12 +++-- drivers/clk/rockchip/clk-rk3288.c | 12 +++-- drivers/clk/samsung/clk-s5pv210-audss.c | 12 +++-- drivers/clk/samsung/clk.c | 12 +++-- drivers/clk/tegra/clk-tegra210.c | 12 +++-- drivers/clocksource/timer-armada-370-xp.c | 12 +++-- drivers/cpuidle/cpuidle-psci.c | 12 +++-- drivers/gpio/gpio-mxc.c | 12 +++-- drivers/gpio/gpio-pxa.c | 12 +++-- drivers/gpio/gpio-sa1100.c | 12 +++-- drivers/hv/vmbus_drv.c | 14 ++++-- drivers/iommu/amd/init.c | 16 +++--- drivers/iommu/intel/iommu.c | 12 +++-- drivers/irqchip/exynos-combiner.c | 14 ++++-- drivers/irqchip/irq-armada-370-xp.c | 12 +++-- drivers/irqchip/irq-bcm7038-l1.c | 12 +++-- drivers/irqchip/irq-gic-v3-its.c | 12 +++-- drivers/irqchip/irq-i8259.c | 12 +++-- drivers/irqchip/irq-imx-gpcv2.c | 16 +++--- drivers/irqchip/irq-loongson-eiointc.c | 12 +++-- drivers/irqchip/irq-loongson-htpic.c | 10 ++-- drivers/irqchip/irq-loongson-htvec.c | 12 +++-- drivers/irqchip/irq-loongson-pch-lpc.c | 12 +++-- drivers/irqchip/irq-loongson-pch-pic.c | 12 +++-- drivers/irqchip/irq-mchp-eic.c | 12 +++-- drivers/irqchip/irq-mst-intc.c | 12 +++-- drivers/irqchip/irq-mtk-cirq.c | 12 +++-- drivers/irqchip/irq-renesas-rzg2l.c | 12 +++-- drivers/irqchip/irq-sa11x0.c | 12 +++-- drivers/irqchip/irq-sifive-plic.c | 12 +++-- drivers/irqchip/irq-sun6i-r.c | 18 ++++--- drivers/irqchip/irq-tegra.c | 12 +++-- drivers/irqchip/irq-vic.c | 12 +++-- drivers/leds/trigger/ledtrig-cpu.c | 14 ++++-- drivers/macintosh/via-pmu.c | 12 +++-- drivers/power/reset/sc27xx-poweroff.c | 10 ++-- drivers/sh/clk/core.c | 10 ++-- drivers/sh/intc/core.c | 12 +++-- drivers/soc/bcm/brcmstb/biuctrl.c | 12 +++-- drivers/soc/tegra/pmc.c | 17 ++++--- drivers/thermal/intel/intel_hfi.c | 12 +++-- drivers/xen/xen-acpi-processor.c | 12 +++-- include/linux/syscore_ops.h | 15 ++++-- kernel/cpu_pm.c | 12 +++-- kernel/irq/generic-chip.c | 14 ++++-- kernel/irq/pm.c | 11 +++-- kernel/printk/printk.c | 11 +++-- kernel/time/sched_clock.c | 22 +++++++-- kernel/time/timekeeping.c | 22 +++++++-- virt/kvm/kvm_main.c | 18 ++++--- 109 files changed, 898 insertions(+), 470 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c index fd0dbeb93357..cb7d8a7b14e0 100644 --- a/arch/arm/mach-exynos/mcpm-exynos.c +++ b/arch/arm/mach-exynos/mcpm-exynos.c @@ -215,7 +215,7 @@ static const struct of_device_id exynos_dt_mcpm_match[] = { {}, }; -static void exynos_mcpm_setup_entry_point(void) +static void exynos_mcpm_setup_entry_point(void *data) { /* * U-Boot SPL is hardcoded to jump to the start of ns_sram_base_addr @@ -228,10 +228,14 @@ static void exynos_mcpm_setup_entry_point(void) __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8); } -static struct syscore_ops exynos_mcpm_syscore_ops = { +static const struct syscore_ops exynos_mcpm_syscore_ops = { .resume = exynos_mcpm_setup_entry_point, }; +static struct syscore exynos_mcpm_syscore = { + .ops = &exynos_mcpm_syscore_ops, +}; + static int __init exynos_mcpm_init(void) { struct device_node *node; @@ -300,9 
+304,9 @@ static int __init exynos_mcpm_init(void) pmu_raw_writel(value, EXYNOS_COMMON_OPTION(i)); } - exynos_mcpm_setup_entry_point(); + exynos_mcpm_setup_entry_point(NULL); - register_syscore_ops(&exynos_mcpm_syscore_ops); + register_syscore(&exynos_mcpm_syscore); return ret; } diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c index 150a1e56dcae..22d723553f62 100644 --- a/arch/arm/mach-exynos/suspend.c +++ b/arch/arm/mach-exynos/suspend.c @@ -53,9 +53,9 @@ struct exynos_pm_data { void (*pm_prepare)(void); void (*pm_resume_prepare)(void); - void (*pm_resume)(void); - int (*pm_suspend)(void); int (*cpu_suspend)(unsigned long); + + const struct syscore_ops *syscore_ops; }; /* Used only on Exynos542x/5800 */ @@ -376,7 +376,7 @@ static void exynos5420_pm_prepare(void) } -static int exynos_pm_suspend(void) +static int exynos_pm_suspend(void *data) { exynos_pm_central_suspend(); @@ -390,7 +390,7 @@ static int exynos_pm_suspend(void) return 0; } -static int exynos5420_pm_suspend(void) +static int exynos5420_pm_suspend(void *data) { u32 this_cluster; @@ -408,7 +408,7 @@ static int exynos5420_pm_suspend(void) return 0; } -static void exynos_pm_resume(void) +static void exynos_pm_resume(void *data) { u32 cpuid = read_cpuid_part(); @@ -429,7 +429,7 @@ early_wakeup: exynos_set_delayed_reset_assertion(true); } -static void exynos3250_pm_resume(void) +static void exynos3250_pm_resume(void *data) { u32 cpuid = read_cpuid_part(); @@ -473,7 +473,7 @@ static void exynos5420_prepare_pm_resume(void) } } -static void exynos5420_pm_resume(void) +static void exynos5420_pm_resume(void *data) { unsigned long tmp; @@ -596,41 +596,52 @@ static const struct platform_suspend_ops exynos_suspend_ops = { .valid = suspend_valid_only_mem, }; +static const struct syscore_ops exynos3250_syscore_ops = { + .suspend = exynos_pm_suspend, + .resume = exynos3250_pm_resume, +}; + static const struct exynos_pm_data exynos3250_pm_data = { .wkup_irq = exynos3250_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos3250_pm_resume, .pm_prepare = exynos3250_pm_prepare, .cpu_suspend = exynos3250_cpu_suspend, + .syscore_ops = &exynos3250_syscore_ops, +}; + +static const struct syscore_ops exynos_syscore_ops = { + .suspend = exynos_pm_suspend, + .resume = exynos_pm_resume, }; static const struct exynos_pm_data exynos4_pm_data = { .wkup_irq = exynos4_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos_pm_resume, .pm_prepare = exynos_pm_prepare, .cpu_suspend = exynos_cpu_suspend, + .syscore_ops = &exynos_syscore_ops, }; static const struct exynos_pm_data exynos5250_pm_data = { .wkup_irq = exynos5250_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos_pm_resume, .pm_prepare = exynos_pm_prepare, .cpu_suspend = exynos_cpu_suspend, + .syscore_ops = &exynos_syscore_ops, +}; + +static const struct syscore_ops exynos5420_syscore_ops = { + .resume = exynos5420_pm_resume, + .suspend = exynos5420_pm_suspend, }; static const struct exynos_pm_data exynos5420_pm_data = { .wkup_irq = exynos5250_wkup_irq, .wake_disable_mask = (0x7F << 7) | (0x1F << 1), .pm_resume_prepare = exynos5420_prepare_pm_resume, - .pm_resume = exynos5420_pm_resume, - .pm_suspend = exynos5420_pm_suspend, .pm_prepare = exynos5420_pm_prepare, .cpu_suspend = exynos5420_cpu_suspend, + .syscore_ops = &exynos5420_syscore_ops, }; static const struct of_device_id 
exynos_pmu_of_device_ids[] __initconst = { @@ -656,7 +667,7 @@ static const struct of_device_id exynos_pmu_of_device_ids[] __initconst = { { /*sentinel*/ }, }; -static struct syscore_ops exynos_pm_syscore_ops; +static struct syscore exynos_pm_syscore; void __init exynos_pm_init(void) { @@ -684,10 +695,9 @@ void __init exynos_pm_init(void) tmp |= pm_data->wake_disable_mask; pmu_raw_writel(tmp, S5P_WAKEUP_MASK); - exynos_pm_syscore_ops.suspend = pm_data->pm_suspend; - exynos_pm_syscore_ops.resume = pm_data->pm_resume; + exynos_pm_syscore.ops = pm_data->syscore_ops; - register_syscore_ops(&exynos_pm_syscore_ops); + register_syscore(&exynos_pm_syscore); suspend_set_ops(&exynos_suspend_ops); /* diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index c9c2c46ecead..caad4fca8de3 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -34,9 +34,9 @@ extern void __init pxa27x_map_io(void); extern void __init pxa3xx_init_irq(void); extern void __init pxa3xx_map_io(void); -extern struct syscore_ops pxa_irq_syscore_ops; -extern struct syscore_ops pxa2xx_mfp_syscore_ops; -extern struct syscore_ops pxa3xx_mfp_syscore_ops; +extern struct syscore pxa_irq_syscore; +extern struct syscore pxa2xx_mfp_syscore; +extern struct syscore pxa3xx_mfp_syscore; void __init pxa_set_ffuart_info(void *info); void __init pxa_set_btuart_info(void *info); diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c index 5bfce8aa4102..99acebbbf065 100644 --- a/arch/arm/mach-pxa/irq.c +++ b/arch/arm/mach-pxa/irq.c @@ -178,7 +178,7 @@ void __init pxa_init_irq(int irq_nr, int (*fn)(struct irq_data *, unsigned int)) static unsigned long saved_icmr[MAX_INTERNAL_IRQS/32]; static unsigned long saved_ipr[MAX_INTERNAL_IRQS]; -static int pxa_irq_suspend(void) +static int pxa_irq_suspend(void *data) { int i; @@ -197,7 +197,7 @@ static int pxa_irq_suspend(void) return 0; } -static void pxa_irq_resume(void) +static void pxa_irq_resume(void *data) { int i; @@ -219,11 +219,15 @@ static void pxa_irq_resume(void) #define pxa_irq_resume NULL #endif -struct syscore_ops pxa_irq_syscore_ops = { +static const struct syscore_ops pxa_irq_syscore_ops = { .suspend = pxa_irq_suspend, .resume = pxa_irq_resume, }; +struct syscore pxa_irq_syscore = { + .ops = &pxa_irq_syscore_ops, +}; + #ifdef CONFIG_OF static const struct of_device_id intc_ids[] __initconst = { { .compatible = "marvell,pxa-intc", }, diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c index f5a3d890f682..d1347055fbe4 100644 --- a/arch/arm/mach-pxa/mfp-pxa2xx.c +++ b/arch/arm/mach-pxa/mfp-pxa2xx.c @@ -346,7 +346,7 @@ static unsigned long saved_gpdr[4]; static unsigned long saved_gplr[4]; static unsigned long saved_pgsr[4]; -static int pxa2xx_mfp_suspend(void) +static int pxa2xx_mfp_suspend(void *data) { int i; @@ -385,7 +385,7 @@ static int pxa2xx_mfp_suspend(void) return 0; } -static void pxa2xx_mfp_resume(void) +static void pxa2xx_mfp_resume(void *data) { int i; @@ -404,11 +404,15 @@ static void pxa2xx_mfp_resume(void) #define pxa2xx_mfp_resume NULL #endif -struct syscore_ops pxa2xx_mfp_syscore_ops = { +static const struct syscore_ops pxa2xx_mfp_syscore_ops = { .suspend = pxa2xx_mfp_suspend, .resume = pxa2xx_mfp_resume, }; +struct syscore pxa2xx_mfp_syscore = { + .ops = &pxa2xx_mfp_syscore_ops, +}; + static int __init pxa2xx_mfp_init(void) { int i; diff --git a/arch/arm/mach-pxa/mfp-pxa3xx.c b/arch/arm/mach-pxa/mfp-pxa3xx.c index d16ab7451efe..fe7498fbb62b 100644 --- a/arch/arm/mach-pxa/mfp-pxa3xx.c +++ 
b/arch/arm/mach-pxa/mfp-pxa3xx.c @@ -27,13 +27,13 @@ * a pull-down mode if they're an active low chip select, and we're * just entering standby. */ -static int pxa3xx_mfp_suspend(void) +static int pxa3xx_mfp_suspend(void *data) { mfp_config_lpm(); return 0; } -static void pxa3xx_mfp_resume(void) +static void pxa3xx_mfp_resume(void *data) { mfp_config_run(); @@ -49,7 +49,11 @@ static void pxa3xx_mfp_resume(void) #define pxa3xx_mfp_resume NULL #endif -struct syscore_ops pxa3xx_mfp_syscore_ops = { +static const struct syscore_ops pxa3xx_mfp_syscore_ops = { .suspend = pxa3xx_mfp_suspend, .resume = pxa3xx_mfp_resume, }; + +struct syscore pxa3xx_mfp_syscore = { + .ops = &pxa3xx_mfp_syscore_ops, +}; diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c index 03e34841fc00..70509a599814 100644 --- a/arch/arm/mach-pxa/pxa25x.c +++ b/arch/arm/mach-pxa/pxa25x.c @@ -235,8 +235,8 @@ static int __init pxa25x_init(void) pxa25x_init_pm(); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa2xx_mfp_syscore); if (!of_have_populated_dt()) { software_node_register(&pxa2xx_gpiochip_node); diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index f8382477d629..ff6361979038 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -337,8 +337,8 @@ static int __init pxa27x_init(void) pxa27x_init_pm(); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa2xx_mfp_syscore); if (!of_have_populated_dt()) { software_node_register(&pxa2xx_gpiochip_node); diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 1d1e5713464d..06c578ea658e 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -424,8 +424,8 @@ static int __init pxa3xx_init(void) if (cpu_is_pxa320()) enable_irq_wake(IRQ_WAKEUP1); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa3xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa3xx_mfp_syscore); } return ret; diff --git a/arch/arm/mach-pxa/smemc.c b/arch/arm/mach-pxa/smemc.c index 2d2a321d82f8..fb93a8f28356 100644 --- a/arch/arm/mach-pxa/smemc.c +++ b/arch/arm/mach-pxa/smemc.c @@ -18,7 +18,7 @@ static unsigned long msc[2]; static unsigned long sxcnfg, memclkcfg; static unsigned long csadrcfg[4]; -static int pxa3xx_smemc_suspend(void) +static int pxa3xx_smemc_suspend(void *data) { msc[0] = __raw_readl(MSC0); msc[1] = __raw_readl(MSC1); @@ -32,7 +32,7 @@ static int pxa3xx_smemc_suspend(void) return 0; } -static void pxa3xx_smemc_resume(void) +static void pxa3xx_smemc_resume(void *data) { __raw_writel(msc[0], MSC0); __raw_writel(msc[1], MSC1); @@ -46,11 +46,15 @@ static void pxa3xx_smemc_resume(void) __raw_writel(0x2, CSMSADRCFG); } -static struct syscore_ops smemc_syscore_ops = { +static const struct syscore_ops smemc_syscore_ops = { .suspend = pxa3xx_smemc_suspend, .resume = pxa3xx_smemc_resume, }; +static struct syscore smemc_syscore = { + .ops = &smemc_syscore_ops, +}; + static int __init smemc_init(void) { if (cpu_is_pxa3xx()) { @@ -64,7 +68,7 @@ static int __init smemc_init(void) */ __raw_writel(0x2, CSMSADRCFG); - register_syscore_ops(&smemc_syscore_ops); + register_syscore(&smemc_syscore); } return 0; diff --git a/arch/arm/mach-s3c/irq-pm-s3c64xx.c b/arch/arm/mach-s3c/irq-pm-s3c64xx.c index 4a1e935bada1..ab726c595001 100644 --- 
a/arch/arm/mach-s3c/irq-pm-s3c64xx.c +++ b/arch/arm/mach-s3c/irq-pm-s3c64xx.c @@ -58,7 +58,7 @@ static struct irq_grp_save { static u32 irq_uart_mask[SERIAL_SAMSUNG_UARTS]; -static int s3c64xx_irq_pm_suspend(void) +static int s3c64xx_irq_pm_suspend(void *data) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -79,7 +79,7 @@ static int s3c64xx_irq_pm_suspend(void) return 0; } -static void s3c64xx_irq_pm_resume(void) +static void s3c64xx_irq_pm_resume(void *data) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -100,18 +100,22 @@ static void s3c64xx_irq_pm_resume(void) S3C_PMDBG("%s: IRQ configuration restored\n", __func__); } -static struct syscore_ops s3c64xx_irq_syscore_ops = { +static const struct syscore_ops s3c64xx_irq_syscore_ops = { .suspend = s3c64xx_irq_pm_suspend, .resume = s3c64xx_irq_pm_resume, }; +static struct syscore s3c64xx_irq_syscore = { + .ops = &s3c64xx_irq_syscore_ops, +}; + static __init int s3c64xx_syscore_init(void) { /* Appropriate drivers (pinctrl, uart) handle this when using DT. */ if (of_have_populated_dt() || !soc_is_s3c64xx()) return 0; - register_syscore_ops(&s3c64xx_irq_syscore_ops); + register_syscore(&s3c64xx_irq_syscore); return 0; } diff --git a/arch/arm/mach-s5pv210/pm.c b/arch/arm/mach-s5pv210/pm.c index 6fa70f787df4..fa270750364c 100644 --- a/arch/arm/mach-s5pv210/pm.c +++ b/arch/arm/mach-s5pv210/pm.c @@ -195,20 +195,24 @@ static const struct platform_suspend_ops s5pv210_suspend_ops = { /* * Syscore operations used to delay restore of certain registers. */ -static void s5pv210_pm_resume(void) +static void s5pv210_pm_resume(void *data) { s3c_pm_do_restore_core(s5pv210_core_save, ARRAY_SIZE(s5pv210_core_save)); } -static struct syscore_ops s5pv210_pm_syscore_ops = { +static const struct syscore_ops s5pv210_pm_syscore_ops = { .resume = s5pv210_pm_resume, }; +static struct syscore s5pv210_pm_syscore = { + .ops = &s5pv210_pm_syscore_ops, +}; + /* * Initialization entry point. 
*/ void __init s5pv210_pm_init(void) { - register_syscore_ops(&s5pv210_pm_syscore_ops); + register_syscore(&s5pv210_pm_syscore); suspend_set_ops(&s5pv210_suspend_ops); } diff --git a/arch/arm/mach-versatile/integrator_ap.c b/arch/arm/mach-versatile/integrator_ap.c index 4bd6712e9f52..ee90d6619d0d 100644 --- a/arch/arm/mach-versatile/integrator_ap.c +++ b/arch/arm/mach-versatile/integrator_ap.c @@ -63,13 +63,13 @@ static void __init ap_map_io(void) #ifdef CONFIG_PM static unsigned long ic_irq_enable; -static int irq_suspend(void) +static int irq_suspend(void *data) { ic_irq_enable = readl(VA_IC_BASE + IRQ_ENABLE); return 0; } -static void irq_resume(void) +static void irq_resume(void *data) { /* disable all irq sources */ cm_clear_irqs(); @@ -83,14 +83,18 @@ static void irq_resume(void) #define irq_resume NULL #endif -static struct syscore_ops irq_syscore_ops = { +static const struct syscore_ops irq_syscore_ops = { .suspend = irq_suspend, .resume = irq_resume, }; +static struct syscore irq_syscore = { + .ops = &irq_syscore_ops, +}; + static int __init irq_syscore_init(void) { - register_syscore_ops(&irq_syscore_ops); + register_syscore(&irq_syscore); return 0; } diff --git a/arch/arm/mm/cache-b15-rac.c b/arch/arm/mm/cache-b15-rac.c index 6f63b90f9e1a..e7807356dfab 100644 --- a/arch/arm/mm/cache-b15-rac.c +++ b/arch/arm/mm/cache-b15-rac.c @@ -256,7 +256,7 @@ static int b15_rac_dead_cpu(unsigned int cpu) return 0; } -static int b15_rac_suspend(void) +static int b15_rac_suspend(void *data) { /* Suspend the read-ahead cache oeprations, forcing our cache * implementation to fallback to the regular ARMv7 calls. @@ -271,7 +271,7 @@ static int b15_rac_suspend(void) return 0; } -static void b15_rac_resume(void) +static void b15_rac_resume(void *data) { /* Coming out of a S3 suspend/resume cycle, the read-ahead cache * register RAC_CONFIG0_REG will be restored to its default value, make @@ -282,11 +282,15 @@ static void b15_rac_resume(void) clear_bit(RAC_SUSPENDED, &b15_rac_flags); } -static struct syscore_ops b15_rac_syscore_ops = { +static const struct syscore_ops b15_rac_syscore_ops = { .suspend = b15_rac_suspend, .resume = b15_rac_resume, }; +static struct syscore b15_rac_syscore = { + .ops = &b15_rac_syscore_ops, +}; + static int __init b15_rac_init(void) { struct device_node *dn, *cpu_dn; @@ -347,7 +351,7 @@ static int __init b15_rac_init(void) } if (IS_ENABLED(CONFIG_PM_SLEEP)) - register_syscore_ops(&b15_rac_syscore_ops); + register_syscore(&b15_rac_syscore); spin_lock(&rac_lock); reg = __raw_readl(b15_rac_base + RAC_CONFIG0_REG); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 46036d98da75..8b2fcb3fb874 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -535,28 +535,32 @@ int hibernate_resume_nonboot_cpu_disable(void) */ #ifdef CONFIG_PM -static int loongson_ipi_suspend(void) +static int loongson_ipi_suspend(void *data) { return 0; } -static void loongson_ipi_resume(void) +static void loongson_ipi_resume(void *data) { iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_EN); } -static struct syscore_ops loongson_ipi_syscore_ops = { +static const struct syscore_ops loongson_ipi_syscore_ops = { .resume = loongson_ipi_resume, .suspend = loongson_ipi_suspend, }; +static struct syscore loongson_ipi_syscore = { + .ops = &loongson_ipi_syscore_ops, +}; + /* * Enable boot cpu ipi before enabling nonboot cpus * during syscore_resume. 
*/ static int __init ipi_pm_init(void) { - register_syscore_ops(&loongson_ipi_syscore_ops); + register_syscore(&loongson_ipi_syscore); return 0; } diff --git a/arch/mips/alchemy/common/dbdma.c b/arch/mips/alchemy/common/dbdma.c index 6a3c890f7bbf..6c2c2010bbae 100644 --- a/arch/mips/alchemy/common/dbdma.c +++ b/arch/mips/alchemy/common/dbdma.c @@ -982,7 +982,7 @@ u32 au1xxx_dbdma_put_dscr(u32 chanid, au1x_ddma_desc_t *dscr) static unsigned long alchemy_dbdma_pm_data[NUM_DBDMA_CHANS + 1][6]; -static int alchemy_dbdma_suspend(void) +static int alchemy_dbdma_suspend(void *data) { int i; void __iomem *addr; @@ -1019,7 +1019,7 @@ static int alchemy_dbdma_suspend(void) return 0; } -static void alchemy_dbdma_resume(void) +static void alchemy_dbdma_resume(void *data) { int i; void __iomem *addr; @@ -1044,11 +1044,15 @@ static void alchemy_dbdma_resume(void) } } -static struct syscore_ops alchemy_dbdma_syscore_ops = { +static const struct syscore_ops alchemy_dbdma_syscore_ops = { .suspend = alchemy_dbdma_suspend, .resume = alchemy_dbdma_resume, }; +static struct syscore alchemy_dbdma_syscore = { + .ops = &alchemy_dbdma_syscore_ops, +}; + static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable) { int ret; @@ -1071,7 +1075,7 @@ static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable) printk(KERN_ERR "Cannot grab DBDMA interrupt!\n"); else { dbdma_initialized = 1; - register_syscore_ops(&alchemy_dbdma_syscore_ops); + register_syscore(&alchemy_dbdma_syscore); } return ret; diff --git a/arch/mips/alchemy/common/irq.c b/arch/mips/alchemy/common/irq.c index da9f9220048f..2403afcd2fb9 100644 --- a/arch/mips/alchemy/common/irq.c +++ b/arch/mips/alchemy/common/irq.c @@ -758,7 +758,7 @@ static inline void alchemy_ic_resume_one(void __iomem *base, unsigned long *d) wmb(); } -static int alchemy_ic_suspend(void) +static int alchemy_ic_suspend(void *data) { alchemy_ic_suspend_one((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR), alchemy_gpic_pmdata); @@ -767,7 +767,7 @@ static int alchemy_ic_suspend(void) return 0; } -static void alchemy_ic_resume(void) +static void alchemy_ic_resume(void *data) { alchemy_ic_resume_one((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR), &alchemy_gpic_pmdata[7]); @@ -775,7 +775,7 @@ static void alchemy_ic_resume(void) alchemy_gpic_pmdata); } -static int alchemy_gpic_suspend(void) +static int alchemy_gpic_suspend(void *data) { void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR); int i; @@ -806,7 +806,7 @@ static int alchemy_gpic_suspend(void) return 0; } -static void alchemy_gpic_resume(void) +static void alchemy_gpic_resume(void *data) { void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR); int i; @@ -837,16 +837,24 @@ static void alchemy_gpic_resume(void) wmb(); } -static struct syscore_ops alchemy_ic_pmops = { +static const struct syscore_ops alchemy_ic_pmops = { .suspend = alchemy_ic_suspend, .resume = alchemy_ic_resume, }; -static struct syscore_ops alchemy_gpic_pmops = { +static struct syscore alchemy_ic_pm = { + .ops = &alchemy_ic_pmops, +}; + +static const struct syscore_ops alchemy_gpic_pmops = { .suspend = alchemy_gpic_suspend, .resume = alchemy_gpic_resume, }; +static struct syscore alchemy_gpic_pm = { + .ops = &alchemy_gpic_pmops, +}; + /******************************************************************************/ /* create chained handlers for the 4 IC requests to the MIPS IRQ ctrl */ @@ -880,7 +888,7 @@ static void __init au1000_init_irq(struct alchemy_irqmap *map) ic_init((void __iomem 
*)KSEG1ADDR(AU1000_IC0_PHYS_ADDR)); ic_init((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR)); - register_syscore_ops(&alchemy_ic_pmops); + register_syscore(&alchemy_ic_pm); mips_cpu_irq_init(); /* register all 64 possible IC0+IC1 irq sources as type "none". @@ -925,7 +933,7 @@ static void __init alchemy_gpic_init_irq(const struct alchemy_irqmap *dints) int i; void __iomem *bank_base; - register_syscore_ops(&alchemy_gpic_pmops); + register_syscore(&alchemy_gpic_pm); mips_cpu_irq_init(); /* disable & ack all possible interrupt sources */ diff --git a/arch/mips/alchemy/common/usb.c b/arch/mips/alchemy/common/usb.c index 5d618547ebf0..a55f32bf517c 100644 --- a/arch/mips/alchemy/common/usb.c +++ b/arch/mips/alchemy/common/usb.c @@ -580,22 +580,26 @@ static void alchemy_usb_pm(int susp) } } -static int alchemy_usb_suspend(void) +static int alchemy_usb_suspend(void *data) { alchemy_usb_pm(1); return 0; } -static void alchemy_usb_resume(void) +static void alchemy_usb_resume(void *data) { alchemy_usb_pm(0); } -static struct syscore_ops alchemy_usb_pm_ops = { +static const struct syscore_ops alchemy_usb_pm_syscore_ops = { .suspend = alchemy_usb_suspend, .resume = alchemy_usb_resume, }; +static struct syscore alchemy_usb_pm_syscore = { + .ops = &alchemy_usb_pm_syscore_ops, +}; + static int __init alchemy_usb_init(void) { int ret = 0; @@ -620,7 +624,7 @@ static int __init alchemy_usb_init(void) } if (!ret) - register_syscore_ops(&alchemy_usb_pm_ops); + register_syscore(&alchemy_usb_pm_syscore); return ret; } diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index 58625d1b6465..6bfee0f71803 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -304,7 +304,7 @@ static int alchemy_pci_def_idsel(unsigned int devsel, int assert) } /* save PCI controller register contents. 
*/ -static int alchemy_pci_suspend(void) +static int alchemy_pci_suspend(void *data) { struct alchemy_pci_context *ctx = __alchemy_pci_ctx; if (!ctx) @@ -326,7 +326,7 @@ static int alchemy_pci_suspend(void) return 0; } -static void alchemy_pci_resume(void) +static void alchemy_pci_resume(void *data) { struct alchemy_pci_context *ctx = __alchemy_pci_ctx; if (!ctx) @@ -354,9 +354,13 @@ static void alchemy_pci_resume(void) alchemy_pci_wired_entry(ctx); /* install it */ } -static struct syscore_ops alchemy_pci_pmops = { - .suspend = alchemy_pci_suspend, - .resume = alchemy_pci_resume, +static const struct syscore_ops alchemy_pci_syscore_ops = { + .suspend = alchemy_pci_suspend, + .resume = alchemy_pci_resume, +}; + +static struct syscore alchemy_pci_syscore = { + .ops = &alchemy_pci_syscore_ops, }; static int alchemy_pci_probe(struct platform_device *pdev) @@ -478,7 +482,7 @@ static int alchemy_pci_probe(struct platform_device *pdev) __alchemy_pci_ctx = ctx; platform_set_drvdata(pdev, ctx); - register_syscore_ops(&alchemy_pci_pmops); + register_syscore(&alchemy_pci_syscore); register_pci_controller(&ctx->alchemy_pci_ctrl); dev_info(&pdev->dev, "PCI controller at %ld MHz\n", diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 2c07387201d0..2ddb93df4817 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -726,7 +726,7 @@ static inline void crash_register_spus(struct list_head *list) } #endif -static void spu_shutdown(void) +static void spu_shutdown(void *data) { struct spu *spu; @@ -738,10 +738,14 @@ static void spu_shutdown(void) mutex_unlock(&spu_full_list_mutex); } -static struct syscore_ops spu_syscore_ops = { +static const struct syscore_ops spu_syscore_ops = { .shutdown = spu_shutdown, }; +static struct syscore spu_syscore = { + .ops = &spu_syscore_ops, +}; + static int __init init_spu_base(void) { int i, ret = 0; @@ -774,7 +778,7 @@ static int __init init_spu_base(void) crash_register_spus(&spu_full_list); mutex_unlock(&spu_full_list_mutex); spu_add_dev_attr(&dev_attr_stat); - register_syscore_ops(&spu_syscore_ops); + register_syscore(&spu_syscore); spu_init_affinity(); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index c37783a03d25..1959cc13438f 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -600,7 +600,7 @@ not_found: return viaint; } -static int pmacpic_suspend(void) +static int pmacpic_suspend(void *data) { int viaint = pmacpic_find_viaint(); @@ -621,7 +621,7 @@ static int pmacpic_suspend(void) return 0; } -static void pmacpic_resume(void) +static void pmacpic_resume(void *data) { int i; @@ -634,15 +634,19 @@ static void pmacpic_resume(void) pmac_unmask_irq(irq_get_irq_data(i)); } -static struct syscore_ops pmacpic_syscore_ops = { +static const struct syscore_ops pmacpic_syscore_ops = { .suspend = pmacpic_suspend, .resume = pmacpic_resume, }; +static struct syscore pmacpic_syscore = { + .ops = &pmacpic_syscore_ops, +}; + static int __init init_pmacpic_syscore(void) { if (pmac_irq_hw[0]) - register_syscore_ops(&pmacpic_syscore_ops); + register_syscore(&pmacpic_syscore); return 0; } diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c index 217cea150987..7ed07232a69a 100644 --- a/arch/powerpc/sysdev/fsl_lbc.c +++ b/arch/powerpc/sysdev/fsl_lbc.c @@ -350,7 +350,7 @@ err: #ifdef CONFIG_SUSPEND /* save lbc registers */ -static int fsl_lbc_syscore_suspend(void) +static int 
fsl_lbc_syscore_suspend(void *data) { struct fsl_lbc_ctrl *ctrl; struct fsl_lbc_regs __iomem *lbc; @@ -374,7 +374,7 @@ out: } /* restore lbc registers */ -static void fsl_lbc_syscore_resume(void) +static void fsl_lbc_syscore_resume(void *data) { struct fsl_lbc_ctrl *ctrl; struct fsl_lbc_regs __iomem *lbc; @@ -408,10 +408,14 @@ static const struct of_device_id fsl_lbc_match[] = { }; #ifdef CONFIG_SUSPEND -static struct syscore_ops lbc_syscore_pm_ops = { +static const struct syscore_ops lbc_syscore_pm_ops = { .suspend = fsl_lbc_syscore_suspend, .resume = fsl_lbc_syscore_resume, }; + +static struct syscore lbc_syscore_pm = { + .ops = &lbc_syscore_pm_ops, +}; #endif static struct platform_driver fsl_lbc_ctrl_driver = { @@ -425,7 +429,7 @@ static struct platform_driver fsl_lbc_ctrl_driver = { static int __init fsl_lbc_init(void) { #ifdef CONFIG_SUSPEND - register_syscore_ops(&lbc_syscore_pm_ops); + register_syscore(&lbc_syscore_pm); #endif return platform_driver_register(&fsl_lbc_ctrl_driver); } diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ef7707ea0db7..4e501654cb41 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1258,7 +1258,7 @@ static void fsl_pci_syscore_do_suspend(struct pci_controller *hose) send_pme_turnoff_message(hose); } -static int fsl_pci_syscore_suspend(void) +static int fsl_pci_syscore_suspend(void *data) { struct pci_controller *hose, *tmp; @@ -1291,7 +1291,7 @@ static void fsl_pci_syscore_do_resume(struct pci_controller *hose) setup_pci_atmu(hose); } -static void fsl_pci_syscore_resume(void) +static void fsl_pci_syscore_resume(void *data) { struct pci_controller *hose, *tmp; @@ -1299,10 +1299,14 @@ static void fsl_pci_syscore_resume(void) fsl_pci_syscore_do_resume(hose); } -static struct syscore_ops pci_syscore_pm_ops = { +static const struct syscore_ops pci_syscore_pm_ops = { .suspend = fsl_pci_syscore_suspend, .resume = fsl_pci_syscore_resume, }; + +static struct syscore pci_syscore_pm = { + .ops = &pci_syscore_pm_ops, +}; #endif void fsl_pcibios_fixup_phb(struct pci_controller *phb) @@ -1359,7 +1363,7 @@ static struct platform_driver fsl_pci_driver = { static int __init fsl_pci_init(void) { #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&pci_syscore_pm_ops); + register_syscore(&pci_syscore_pm); #endif return platform_driver_register(&fsl_pci_driver); } diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 70be2105865d..290ba8427239 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -817,7 +817,7 @@ static struct { u32 sercr; } ipic_saved_state; -static int ipic_suspend(void) +static int ipic_suspend(void *data) { struct ipic *ipic = primary_ipic; @@ -848,7 +848,7 @@ static int ipic_suspend(void) return 0; } -static void ipic_resume(void) +static void ipic_resume(void *data) { struct ipic *ipic = primary_ipic; @@ -870,18 +870,22 @@ static void ipic_resume(void) #define ipic_resume NULL #endif -static struct syscore_ops ipic_syscore_ops = { +static const struct syscore_ops ipic_syscore_ops = { .suspend = ipic_suspend, .resume = ipic_resume, }; +static struct syscore ipic_syscore = { + .ops = &ipic_syscore_ops, +}; + static int __init init_ipic_syscore(void) { if (!primary_ipic || !primary_ipic->regs) return -ENODEV; printk(KERN_DEBUG "Registering ipic system core operations\n"); - register_syscore_ops(&ipic_syscore_ops); + register_syscore(&ipic_syscore); return 0; } diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 
ad7310bba00b..67e51998d1ae 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1944,7 +1944,7 @@ static void mpic_suspend_one(struct mpic *mpic) } } -static int mpic_suspend(void) +static int mpic_suspend(void *data) { struct mpic *mpic = mpics; @@ -1986,7 +1986,7 @@ static void mpic_resume_one(struct mpic *mpic) } /* end for loop */ } -static void mpic_resume(void) +static void mpic_resume(void *data) { struct mpic *mpic = mpics; @@ -1996,19 +1996,23 @@ static void mpic_resume(void) } } -static struct syscore_ops mpic_syscore_ops = { +static const struct syscore_ops mpic_syscore_ops = { .resume = mpic_resume, .suspend = mpic_suspend, }; +static struct syscore mpic_syscore = { + .ops = &mpic_syscore_ops, +}; + static int mpic_init_sys(void) { int rc; - register_syscore_ops(&mpic_syscore_ops); + register_syscore(&mpic_syscore); rc = subsys_system_register(&mpic_subsys, NULL); if (rc) { - unregister_syscore_ops(&mpic_syscore_ops); + unregister_syscore(&mpic_syscore); pr_err("mpic: Failed to register subsystem!\n"); return rc; } diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c index 7166e2e0baaf..60f5b3934b51 100644 --- a/arch/powerpc/sysdev/mpic_timer.c +++ b/arch/powerpc/sysdev/mpic_timer.c @@ -519,7 +519,7 @@ out: kfree(priv); } -static void mpic_timer_resume(void) +static void mpic_timer_resume(void *data) { struct timer_group_priv *priv; @@ -535,10 +535,14 @@ static const struct of_device_id mpic_timer_ids[] = { {}, }; -static struct syscore_ops mpic_timer_syscore_ops = { +static const struct syscore_ops mpic_timer_syscore_ops = { .resume = mpic_timer_resume, }; +static struct syscore mpic_timer_syscore = { + .ops = &mpic_timer_syscore_ops, +}; + static int __init mpic_timer_init(void) { struct device_node *np = NULL; @@ -546,7 +550,7 @@ static int __init mpic_timer_init(void) for_each_matching_node(np, mpic_timer_ids) timer_group_init(np); - register_syscore_ops(&mpic_timer_syscore_ops); + register_syscore(&mpic_timer_syscore); if (list_empty(&timer_group_list)) return -ENODEV; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 68eb7cc6e564..482eec50f404 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -857,7 +857,7 @@ static int __init pmb_debugfs_init(void) subsys_initcall(pmb_debugfs_init); #ifdef CONFIG_PM -static void pmb_syscore_resume(void) +static void pmb_syscore_resume(void *data) { struct pmb_entry *pmbe; int i; @@ -874,13 +874,17 @@ static void pmb_syscore_resume(void) read_unlock(&pmb_rwlock); } -static struct syscore_ops pmb_syscore_ops = { +static const struct syscore_ops pmb_syscore_ops = { .resume = pmb_syscore_resume, }; +static struct syscore pmb_syscore = { + .ops = &pmb_syscore_ops, +}; + static int __init pmb_sysdev_init(void) { - register_syscore_ops(&pmb_syscore_ops); + register_syscore(&pmb_syscore); return 0; } subsys_initcall(pmb_sysdev_init); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 112f43b23ebf..aca89f23d2e0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1718,26 +1718,30 @@ static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu) #ifdef CONFIG_PM -static int perf_ibs_suspend(void) +static int perf_ibs_suspend(void *data) { clear_APIC_ibs(); return 0; } -static void perf_ibs_resume(void) +static void perf_ibs_resume(void *data) { ibs_eilvt_setup(); setup_APIC_ibs(); } -static struct syscore_ops perf_ibs_syscore_ops = { +static const struct syscore_ops perf_ibs_syscore_ops = { .resume = perf_ibs_resume, .suspend = perf_ibs_suspend, 
}; +static struct syscore perf_ibs_syscore = { + .ops = &perf_ibs_syscore_ops, +}; + static void perf_ibs_pm_init(void) { - register_syscore_ops(&perf_ibs_syscore_ops); + register_syscore(&perf_ibs_syscore); } #else diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e890fd37e9c2..085ef4f2e73a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -351,7 +351,7 @@ static int __init hv_pci_init(void) return 1; } -static int hv_suspend(void) +static int hv_suspend(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; @@ -378,7 +378,7 @@ static int hv_suspend(void) return ret; } -static void hv_resume(void) +static void hv_resume(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; @@ -405,11 +405,15 @@ static void hv_resume(void) } /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ -static struct syscore_ops hv_syscore_ops = { +static const struct syscore_ops hv_syscore_ops = { .suspend = hv_suspend, .resume = hv_resume, }; +static struct syscore hv_syscore = { + .ops = &hv_syscore_ops, +}; + static void (* __initdata old_setup_percpu_clockev)(void); static void __init hv_stimer_setup_percpu_clockev(void) @@ -569,7 +573,7 @@ skip_hypercall_pg_init: x86_init.pci.arch_init = hv_pci_init; - register_syscore_ops(&hv_syscore_ops); + register_syscore(&hv_syscore); if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 3485d419c2f5..e6e68a31634c 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -591,7 +591,7 @@ static void gart_fixup_northbridges(void) } } -static void gart_resume(void) +static void gart_resume(void *data) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); @@ -600,11 +600,15 @@ static void gart_resume(void) enable_gart_translations(); } -static struct syscore_ops gart_syscore_ops = { +static const struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; +static struct syscore gart_syscore = { + .ops = &gart_syscore_ops, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -650,7 +654,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - register_syscore_ops(&gart_syscore_ops); + register_syscore(&gart_syscore); flush_gart(); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 680d305589a3..fd87b1562c7e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2381,7 +2381,7 @@ static struct { unsigned int apic_cmci; } apic_pm_state; -static int lapic_suspend(void) +static int lapic_suspend(void *data) { unsigned long flags; int maxlvt; @@ -2429,7 +2429,7 @@ static int lapic_suspend(void) return 0; } -static void lapic_resume(void) +static void lapic_resume(void *data) { unsigned int l, h; unsigned long flags; @@ -2504,11 +2504,15 @@ static void lapic_resume(void) * are needed on every CPU up until machine_halt/restart/poweroff. */ -static struct syscore_ops lapic_syscore_ops = { +static const struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; +static struct syscore lapic_syscore = { + .ops = &lapic_syscore_ops, +}; + static void apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2518,7 +2522,7 @@ static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ if (boot_cpu_has(X86_FEATURE_APIC)) - register_syscore_ops(&lapic_syscore_ops); + register_syscore(&lapic_syscore); return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ba2feb2c04c..84e200662ce6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2308,7 +2308,12 @@ static void resume_ioapic_id(int ioapic_idx) } } -static void ioapic_resume(void) +static int ioapic_suspend(void *data) +{ + return save_ioapic_entries(); +} + +static void ioapic_resume(void *data) { int ioapic_idx; @@ -2318,14 +2323,18 @@ static void ioapic_resume(void) restore_ioapic_entries(); } -static struct syscore_ops ioapic_syscore_ops = { - .suspend = save_ioapic_entries, +static const struct syscore_ops ioapic_syscore_ops = { + .suspend = ioapic_suspend, .resume = ioapic_resume, }; +static struct syscore ioapic_syscore = { + .ops = &ioapic_syscore_ops, +}; + static int __init ioapic_init_ops(void) { - register_syscore_ops(&ioapic_syscore_ops); + register_syscore(&ioapic_syscore); return 0; } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index a315b0627dfb..7ffc78d5ebf2 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; -static void init_counter_refs(void) +static void init_counter_refs(void *data) { u64 aperf, mperf; @@ -289,16 +289,20 @@ out: } #ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { +static const struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; -static void register_freq_invariance_syscore_ops(void) +static struct syscore freq_invariance_syscore = { + .ops = &freq_invariance_syscore_ops, +}; + +static void register_freq_invariance_syscore(void) { - register_syscore_ops(&freq_invariance_syscore_ops); + register_syscore(&freq_invariance_syscore); } #else -static inline void register_freq_invariance_syscore_ops(void) {} +static inline void register_freq_invariance_syscore(void) {} #endif static void freq_invariance_enable(void) @@ -308,7 +312,7 @@ static void freq_invariance_enable(void) return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); + register_freq_invariance_syscore(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -535,7 +539,7 @@ static int __init bp_init_aperfmperf(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; - init_counter_refs(); + init_counter_refs(NULL); bp_init_freq_invariance(); return 0; } @@ -544,5 +548,5 @@ early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - init_counter_refs(); + init_counter_refs(NULL); } diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index bc7671f920a7..2c56f8730f59 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -75,7 +75,7 @@ static u8 energ_perf_values[] = { [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, }; -static int intel_epb_save(void) +static int intel_epb_save(void *data) { u64 epb; @@ -89,7 +89,7 @@ static int intel_epb_save(void) return 0; } -static void intel_epb_restore(void) +static void intel_epb_restore(void *data) { u64 val = this_cpu_read(saved_epb); u64 epb; @@ -114,11 +114,15 @@ static void intel_epb_restore(void) 
wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } -static struct syscore_ops intel_epb_syscore_ops = { +static const struct syscore_ops intel_epb_syscore_ops = { .suspend = intel_epb_save, .resume = intel_epb_restore, }; +static struct syscore intel_epb_syscore = { + .ops = &intel_epb_syscore_ops, +}; + static const char * const energy_perf_strings[] = { [EPB_INDEX_PERFORMANCE] = "performance", [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", @@ -185,7 +189,7 @@ static int intel_epb_online(unsigned int cpu) { struct device *cpu_dev = get_cpu_device(cpu); - intel_epb_restore(); + intel_epb_restore(NULL); if (!cpuhp_tasks_frozen) sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); @@ -199,7 +203,7 @@ static int intel_epb_offline(unsigned int cpu) if (!cpuhp_tasks_frozen) sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); - intel_epb_save(); + intel_epb_save(NULL); return 0; } @@ -230,7 +234,7 @@ static __init int intel_epb_init(void) if (ret < 0) goto err_out_online; - register_syscore_ops(&intel_epb_syscore_ops); + register_syscore(&intel_epb_syscore); return 0; err_out_online: diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460e90a1a0b1..23bfbc7dfb8e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2410,13 +2410,13 @@ static void vendor_disable_error_reporting(void) mce_disable_error_reporting(); } -static int mce_syscore_suspend(void) +static int mce_syscore_suspend(void *data) { vendor_disable_error_reporting(); return 0; } -static void mce_syscore_shutdown(void) +static void mce_syscore_shutdown(void *data) { vendor_disable_error_reporting(); } @@ -2426,7 +2426,7 @@ static void mce_syscore_shutdown(void) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static void mce_syscore_resume(void) +static void mce_syscore_resume(void *data) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); @@ -2434,12 +2434,16 @@ static void mce_syscore_resume(void) cr4_set_bits(X86_CR4_MCE); } -static struct syscore_ops mce_syscore_ops = { +static const struct syscore_ops mce_syscore_ops = { .suspend = mce_syscore_suspend, .shutdown = mce_syscore_shutdown, .resume = mce_syscore_resume, }; +static struct syscore mce_syscore = { + .ops = &mce_syscore_ops, +}; + /* * mce_device: Sysfs support */ @@ -2840,7 +2844,7 @@ static __init int mcheck_init_device(void) if (err < 0) goto err_out_online; - register_syscore_ops(&mce_syscore_ops); + register_syscore(&mce_syscore); return 0; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f75c140906d0..3262756f8c32 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -812,8 +812,17 @@ void microcode_bsp_resume(void) reload_early_microcode(cpu); } -static struct syscore_ops mc_syscore_ops = { - .resume = microcode_bsp_resume, +static void microcode_bsp_syscore_resume(void *data) +{ + microcode_bsp_resume(); +} + +static const struct syscore_ops mc_syscore_ops = { + .resume = microcode_bsp_syscore_resume, +}; + +static struct syscore mc_syscore = { + .ops = &mc_syscore_ops, }; static int mc_cpu_online(unsigned int cpu) @@ -892,7 +901,7 @@ static int __init microcode_init(void) } } - register_syscore_ops(&mc_syscore_ops); + register_syscore(&mc_syscore); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c 
b/arch/x86/kernel/cpu/mtrr/legacy.c index d25882fcf181..2415ffaaf02c 100644 --- a/arch/x86/kernel/cpu/mtrr/legacy.c +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -41,7 +41,7 @@ struct mtrr_value { static struct mtrr_value *mtrr_value; -static int mtrr_save(void) +static int mtrr_save(void *data) { int i; @@ -56,7 +56,7 @@ static int mtrr_save(void) return 0; } -static void mtrr_restore(void) +static void mtrr_restore(void *data) { int i; @@ -69,11 +69,15 @@ static void mtrr_restore(void) } } -static struct syscore_ops mtrr_syscore_ops = { +static const struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; +static struct syscore mtrr_syscore = { + .ops = &mtrr_syscore_ops, +}; + void mtrr_register_syscore(void) { mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); @@ -86,5 +90,5 @@ void mtrr_register_syscore(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - register_syscore_ops(&mtrr_syscore_ops); + register_syscore(&mtrr_syscore); } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 933fcd7ff250..e4a31c536642 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -86,15 +86,19 @@ static int umwait_cpu_offline(unsigned int cpu) * trust the firmware nor does it matter if the same value is written * again. */ -static void umwait_syscore_resume(void) +static void umwait_syscore_resume(void *data) { umwait_update_control_msr(NULL); } -static struct syscore_ops umwait_syscore_ops = { +static const struct syscore_ops umwait_syscore_ops = { .resume = umwait_syscore_resume, }; +static struct syscore umwait_syscore = { + .ops = &umwait_syscore_ops, +}; + /* sysfs interface */ /* @@ -226,7 +230,7 @@ static int __init umwait_init(void) return ret; } - register_syscore_ops(&umwait_syscore_ops); + register_syscore(&umwait_syscore); /* * Add umwait control interface. Ignore failure, so at least the diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 2cd124ad9380..896d46b44284 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -19,7 +19,7 @@ * in asm/dma.h. 
*/ -static void i8237A_resume(void) +static void i8237A_resume(void *data) { unsigned long flags; int i; @@ -41,10 +41,14 @@ static void i8237A_resume(void) release_dma_lock(flags); } -static struct syscore_ops i8237_syscore_ops = { +static const struct syscore_ops i8237_syscore_ops = { .resume = i8237A_resume, }; +static struct syscore i8237_syscore = { + .ops = &i8237_syscore_ops, +}; + static int __init i8237A_init_ops(void) { /* @@ -70,7 +74,7 @@ static int __init i8237A_init_ops(void) if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017) return -ENODEV; - register_syscore_ops(&i8237_syscore_ops); + register_syscore(&i8237_syscore); return 0; } device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2bade73f49e3..f67063df6723 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -247,19 +247,19 @@ static void save_ELCR(char *trigger) trigger[1] = inb(PIC_ELCR2) & 0xDE; } -static void i8259A_resume(void) +static void i8259A_resume(void *data) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); } -static int i8259A_suspend(void) +static int i8259A_suspend(void *data) { save_ELCR(irq_trigger); return 0; } -static void i8259A_shutdown(void) +static void i8259A_shutdown(void *data) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -269,12 +269,16 @@ static void i8259A_shutdown(void) outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } -static struct syscore_ops i8259_syscore_ops = { +static const struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; +static struct syscore i8259_syscore = { + .ops = &i8259_syscore_ops, +}; + static void mask_8259A(void) { unsigned long flags; @@ -444,7 +448,7 @@ EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { if (legacy_pic == &default_legacy_pic) - register_syscore_ops(&i8259_syscore_ops); + register_syscore(&i8259_syscore); return 0; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b67d7c59dca0..1500852ba03c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -720,7 +720,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) #endif -static int kvm_suspend(void) +static int kvm_suspend(void *data) { u64 val = 0; @@ -734,7 +734,7 @@ static int kvm_suspend(void) return 0; } -static void kvm_resume(void) +static void kvm_resume(void *data) { kvm_cpu_online(raw_smp_processor_id()); @@ -744,11 +744,15 @@ static void kvm_resume(void) #endif } -static struct syscore_ops kvm_syscore_ops = { +static const struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; +static struct syscore kvm_syscore = { + .ops = &kvm_syscore_ops, +}; + static void kvm_pv_guest_cpu_reboot(void *unused) { kvm_guest_cpu_offline(true); @@ -858,7 +862,7 @@ static void __init kvm_guest_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - register_syscore_ops(&kvm_syscore_ops); + register_syscore(&kvm_syscore); /* * Hard lockup detection is enabled by default. 
Disable it, as guests diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index e4560b33b8ad..bed7dc85612e 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -761,7 +761,7 @@ static int acpi_pci_link_resume(struct acpi_pci_link *link) return 0; } -static void irqrouter_resume(void) +static void irqrouter_resume(void *data) { struct acpi_pci_link *link; @@ -888,10 +888,14 @@ static int __init acpi_irq_balance_set(char *str) __setup("acpi_irq_balance", acpi_irq_balance_set); -static struct syscore_ops irqrouter_syscore_ops = { +static const struct syscore_ops irqrouter_syscore_ops = { .resume = irqrouter_resume, }; +static struct syscore irqrouter_syscore = { + .ops = &irqrouter_syscore_ops, +}; + void __init acpi_pci_link_init(void) { if (acpi_noirq) @@ -904,6 +908,6 @@ void __init acpi_pci_link_init(void) else acpi_irq_balance = 0; } - register_syscore_ops(&irqrouter_syscore_ops); + register_syscore(&irqrouter_syscore); acpi_scan_add_handler(&pci_link_handler); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c8ee8e42b0f6..aaf57d0aaa19 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -884,13 +884,13 @@ bool acpi_s2idle_wakeup(void) #ifdef CONFIG_PM_SLEEP static u32 saved_bm_rld; -static int acpi_save_bm_rld(void) +static int acpi_save_bm_rld(void *data) { acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &saved_bm_rld); return 0; } -static void acpi_restore_bm_rld(void) +static void acpi_restore_bm_rld(void *data) { u32 resumed_bm_rld = 0; @@ -901,14 +901,18 @@ static void acpi_restore_bm_rld(void) acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, saved_bm_rld); } -static struct syscore_ops acpi_sleep_syscore_ops = { +static const struct syscore_ops acpi_sleep_syscore_ops = { .suspend = acpi_save_bm_rld, .resume = acpi_restore_bm_rld, }; +static struct syscore acpi_sleep_syscore = { + .ops = &acpi_sleep_syscore_ops, +}; + static void acpi_sleep_syscore_init(void) { - register_syscore_ops(&acpi_sleep_syscore_ops); + register_syscore(&acpi_sleep_syscore); } #else static inline void acpi_sleep_syscore_init(void) {} diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 6942c62fa59d..8191dbab92c4 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -1585,16 +1585,20 @@ static int fw_pm_notify(struct notifier_block *notify_block, } /* stop caching firmware once syscore_suspend is reached */ -static int fw_suspend(void) +static int fw_suspend(void *data) { fw_cache.state = FW_LOADER_NO_CACHE; return 0; } -static struct syscore_ops fw_syscore_ops = { +static const struct syscore_ops fw_syscore_ops = { .suspend = fw_suspend, }; +static struct syscore fw_syscore = { + .ops = &fw_syscore_ops, +}; + static int __init register_fw_pm_ops(void) { int ret; @@ -1610,14 +1614,14 @@ static int __init register_fw_pm_ops(void) if (ret) return ret; - register_syscore_ops(&fw_syscore_ops); + register_syscore(&fw_syscore); return ret; } static inline void unregister_fw_pm_ops(void) { - unregister_syscore_ops(&fw_syscore_ops); + unregister_syscore(&fw_syscore); unregister_pm_notifier(&fw_cache.pm_notify); } #else diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index 13db1f78d2ce..483adb796654 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -11,32 +11,32 @@ #include <linux/suspend.h> #include <trace/events/power.h> -static LIST_HEAD(syscore_ops_list); -static DEFINE_MUTEX(syscore_ops_lock); +static LIST_HEAD(syscore_list); +static DEFINE_MUTEX(syscore_lock); /** - *
register_syscore_ops - Register a set of system core operations. - * @ops: System core operations to register. + * register_syscore - Register a set of system core operations. + * @syscore: System core operations to register. */ -void register_syscore_ops(struct syscore_ops *ops) +void register_syscore(struct syscore *syscore) { - mutex_lock(&syscore_ops_lock); - list_add_tail(&ops->node, &syscore_ops_list); - mutex_unlock(&syscore_ops_lock); + mutex_lock(&syscore_lock); + list_add_tail(&syscore->node, &syscore_list); + mutex_unlock(&syscore_lock); } -EXPORT_SYMBOL_GPL(register_syscore_ops); +EXPORT_SYMBOL_GPL(register_syscore); /** - * unregister_syscore_ops - Unregister a set of system core operations. - * @ops: System core operations to unregister. + * unregister_syscore - Unregister a set of system core operations. + * @syscore: System core operations to unregister. */ -void unregister_syscore_ops(struct syscore_ops *ops) +void unregister_syscore(struct syscore *syscore) { - mutex_lock(&syscore_ops_lock); - list_del(&ops->node); - mutex_unlock(&syscore_ops_lock); + mutex_lock(&syscore_lock); + list_del(&syscore->node); + mutex_unlock(&syscore_lock); } -EXPORT_SYMBOL_GPL(unregister_syscore_ops); +EXPORT_SYMBOL_GPL(unregister_syscore); #ifdef CONFIG_PM_SLEEP /** @@ -46,7 +46,7 @@ EXPORT_SYMBOL_GPL(unregister_syscore_ops); */ int syscore_suspend(void) { - struct syscore_ops *ops; + struct syscore *syscore; int ret = 0; trace_suspend_resume(TPS("syscore_suspend"), 0, true); @@ -59,25 +59,27 @@ int syscore_suspend(void) WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core suspend.\n"); - list_for_each_entry_reverse(ops, &syscore_ops_list, node) - if (ops->suspend) { - pm_pr_dbg("Calling %pS\n", ops->suspend); - ret = ops->suspend(); + list_for_each_entry_reverse(syscore, &syscore_list, node) + if (syscore->ops->suspend) { + pm_pr_dbg("Calling %pS\n", syscore->ops->suspend); + ret = syscore->ops->suspend(syscore->data); if (ret) goto err_out; WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pS\n", ops->suspend); + "Interrupts enabled after %pS\n", + syscore->ops->suspend); } trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: - pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); + pr_err("PM: System core suspend callback %pS failed.\n", + syscore->ops->suspend); - list_for_each_entry_continue(ops, &syscore_ops_list, node) - if (ops->resume) - ops->resume(); + list_for_each_entry_continue(syscore, &syscore_list, node) + if (syscore->ops->resume) + syscore->ops->resume(syscore->data); return ret; } @@ -90,18 +92,19 @@ EXPORT_SYMBOL_GPL(syscore_suspend); */ void syscore_resume(void) { - struct syscore_ops *ops; + struct syscore *syscore; trace_suspend_resume(TPS("syscore_resume"), 0, true); WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core resume.\n"); - list_for_each_entry(ops, &syscore_ops_list, node) - if (ops->resume) { - pm_pr_dbg("Calling %pS\n", ops->resume); - ops->resume(); + list_for_each_entry(syscore, &syscore_list, node) + if (syscore->ops->resume) { + pm_pr_dbg("Calling %pS\n", syscore->ops->resume); + syscore->ops->resume(syscore->data); WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pS\n", ops->resume); + "Interrupts enabled after %pS\n", + syscore->ops->resume); } trace_suspend_resume(TPS("syscore_resume"), 0, false); } @@ -113,16 +116,17 @@ EXPORT_SYMBOL_GPL(syscore_resume); */ void syscore_shutdown(void) { - struct syscore_ops *ops; + struct syscore *syscore; - 
mutex_lock(&syscore_ops_lock); + mutex_lock(&syscore_lock); - list_for_each_entry_reverse(ops, &syscore_ops_list, node) - if (ops->shutdown) { + list_for_each_entry_reverse(syscore, &syscore_list, node) + if (syscore->ops->shutdown) { if (initcall_debug) - pr_info("PM: Calling %pS\n", ops->shutdown); - ops->shutdown(); + pr_info("PM: Calling %pS\n", + syscore->ops->shutdown); + syscore->ops->shutdown(syscore->data); } - mutex_unlock(&syscore_ops_lock); + mutex_unlock(&syscore_lock); } diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index 00cb792bda18..dd94145c9b22 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -1006,7 +1006,7 @@ static __init int mvebu_mbus_debugfs_init(void) } fs_initcall(mvebu_mbus_debugfs_init); -static int mvebu_mbus_suspend(void) +static int mvebu_mbus_suspend(void *data) { struct mvebu_mbus_state *s = &mbus_state; int win; @@ -1040,7 +1040,7 @@ static int mvebu_mbus_suspend(void) return 0; } -static void mvebu_mbus_resume(void) +static void mvebu_mbus_resume(void *data) { struct mvebu_mbus_state *s = &mbus_state; int win; @@ -1069,9 +1069,13 @@ static void mvebu_mbus_resume(void) } } -static struct syscore_ops mvebu_mbus_syscore_ops = { - .suspend = mvebu_mbus_suspend, - .resume = mvebu_mbus_resume, +static const struct syscore_ops mvebu_mbus_syscore_ops = { + .suspend = mvebu_mbus_suspend, + .resume = mvebu_mbus_resume, +}; + +static struct syscore mvebu_mbus_syscore = { + .ops = &mvebu_mbus_syscore_ops, }; static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus, @@ -1118,7 +1122,7 @@ static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus, writel(UNIT_SYNC_BARRIER_ALL, mbus->mbuswins_base + UNIT_SYNC_BARRIER_OFF); - register_syscore_ops(&mvebu_mbus_syscore_ops); + register_syscore(&mvebu_mbus_syscore); return 0; } diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c index acf780a81589..2310f6f73162 100644 --- a/drivers/clk/at91/pmc.c +++ b/drivers/clk/at91/pmc.c @@ -115,7 +115,7 @@ struct pmc_data *pmc_data_allocate(unsigned int ncore, unsigned int nsystem, /* Address in SECURAM that say if we suspend to backup mode. 
*/ static void __iomem *at91_pmc_backup_suspend; -static int at91_pmc_suspend(void) +static int at91_pmc_suspend(void *data) { unsigned int backup; @@ -129,7 +129,7 @@ static int at91_pmc_suspend(void) return clk_save_context(); } -static void at91_pmc_resume(void) +static void at91_pmc_resume(void *data) { unsigned int backup; @@ -143,11 +143,15 @@ static void at91_pmc_resume(void) clk_restore_context(); } -static struct syscore_ops pmc_syscore_ops = { +static const struct syscore_ops pmc_syscore_ops = { .suspend = at91_pmc_suspend, .resume = at91_pmc_resume, }; +static struct syscore pmc_syscore = { + .ops = &pmc_syscore_ops, +}; + static const struct of_device_id pmc_dt_ids[] = { { .compatible = "atmel,sama5d2-pmc" }, { .compatible = "microchip,sama7g5-pmc", }, @@ -185,7 +189,7 @@ static int __init pmc_register_ops(void) return -ENOMEM; } - register_syscore_ops(&pmc_syscore_ops); + register_syscore(&pmc_syscore); return 0; } diff --git a/drivers/clk/imx/clk-vf610.c b/drivers/clk/imx/clk-vf610.c index 9e11f1c7c397..41eb38552a9c 100644 --- a/drivers/clk/imx/clk-vf610.c +++ b/drivers/clk/imx/clk-vf610.c @@ -139,7 +139,7 @@ static struct clk * __init vf610_get_fixed_clock( return clk; }; -static int vf610_clk_suspend(void) +static int vf610_clk_suspend(void *data) { int i; @@ -156,7 +156,7 @@ static int vf610_clk_suspend(void) return 0; } -static void vf610_clk_resume(void) +static void vf610_clk_resume(void *data) { int i; @@ -171,11 +171,15 @@ static void vf610_clk_resume(void) writel_relaxed(ccgr[i], CCM_CCGRx(i)); } -static struct syscore_ops vf610_clk_syscore_ops = { +static const struct syscore_ops vf610_clk_syscore_ops = { .suspend = vf610_clk_suspend, .resume = vf610_clk_resume, }; +static struct syscore vf610_clk_syscore = { + .ops = &vf610_clk_syscore_ops, +}; + static void __init vf610_clocks_init(struct device_node *ccm_node) { struct device_node *np; @@ -462,7 +466,7 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) for (i = 0; i < ARRAY_SIZE(clks_init_on); i++) clk_prepare_enable(clk[clks_init_on[i]]); - register_syscore_ops(&vf610_clk_syscore_ops); + register_syscore(&vf610_clk_syscore); /* Add the clocks to provider list */ clk_data.clks = clk; diff --git a/drivers/clk/ingenic/jz4725b-cgu.c b/drivers/clk/ingenic/jz4725b-cgu.c index 590e9c85cb25..94cee44c854f 100644 --- a/drivers/clk/ingenic/jz4725b-cgu.c +++ b/drivers/clk/ingenic/jz4725b-cgu.c @@ -268,6 +268,6 @@ static void __init jz4725b_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4725b_cgu, "ingenic,jz4725b-cgu", jz4725b_cgu_init); diff --git a/drivers/clk/ingenic/jz4740-cgu.c b/drivers/clk/ingenic/jz4740-cgu.c index 3e0a30574ebb..2def3aedc8dd 100644 --- a/drivers/clk/ingenic/jz4740-cgu.c +++ b/drivers/clk/ingenic/jz4740-cgu.c @@ -266,6 +266,6 @@ static void __init jz4740_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-cgu", jz4740_cgu_init); diff --git a/drivers/clk/ingenic/jz4755-cgu.c b/drivers/clk/ingenic/jz4755-cgu.c index f2c2d848dab7..17cf5dcaece9 100644 --- a/drivers/clk/ingenic/jz4755-cgu.c +++ b/drivers/clk/ingenic/jz4755-cgu.c @@ -337,7 +337,7 @@ static void __init jz4755_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU 
Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c index e407f00bd594..372fe4b07992 100644 --- a/drivers/clk/ingenic/jz4760-cgu.c +++ b/drivers/clk/ingenic/jz4760-cgu.c @@ -436,7 +436,7 @@ static void __init jz4760_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* We only probe via devicetree, no need for a platform driver */ diff --git a/drivers/clk/ingenic/jz4770-cgu.c b/drivers/clk/ingenic/jz4770-cgu.c index 6ae1740367f9..58f1d3bad677 100644 --- a/drivers/clk/ingenic/jz4770-cgu.c +++ b/drivers/clk/ingenic/jz4770-cgu.c @@ -456,7 +456,7 @@ static void __init jz4770_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* We only probe via devicetree, no need for a platform driver */ diff --git a/drivers/clk/ingenic/jz4780-cgu.c b/drivers/clk/ingenic/jz4780-cgu.c index 07e2f3c5c454..1e88aef7ac0f 100644 --- a/drivers/clk/ingenic/jz4780-cgu.c +++ b/drivers/clk/ingenic/jz4780-cgu.c @@ -803,6 +803,6 @@ static void __init jz4780_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4780_cgu, "ingenic,jz4780-cgu", jz4780_cgu_init); diff --git a/drivers/clk/ingenic/pm.c b/drivers/clk/ingenic/pm.c index 341752b640d2..206d5cf2872f 100644 --- a/drivers/clk/ingenic/pm.c +++ b/drivers/clk/ingenic/pm.c @@ -15,7 +15,7 @@ static void __iomem * __maybe_unused ingenic_cgu_base; -static int __maybe_unused ingenic_cgu_pm_suspend(void) +static int __maybe_unused ingenic_cgu_pm_suspend(void *data) { u32 val = readl(ingenic_cgu_base + CGU_REG_LCR); @@ -24,22 +24,26 @@ static int __maybe_unused ingenic_cgu_pm_suspend(void) return 0; } -static void __maybe_unused ingenic_cgu_pm_resume(void) +static void __maybe_unused ingenic_cgu_pm_resume(void *data) { u32 val = readl(ingenic_cgu_base + CGU_REG_LCR); writel(val & ~LCR_LOW_POWER_MODE, ingenic_cgu_base + CGU_REG_LCR); } -static struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = { +static const struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = { .suspend = ingenic_cgu_pm_suspend, .resume = ingenic_cgu_pm_resume, }; -void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu) +static struct syscore __maybe_unused ingenic_cgu_pm = { + .ops = &ingenic_cgu_pm_ops, +}; + +void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu) { if (IS_ENABLED(CONFIG_PM_SLEEP)) { ingenic_cgu_base = cgu->base; - register_syscore_ops(&ingenic_cgu_pm_ops); + register_syscore(&ingenic_cgu_pm); } } diff --git a/drivers/clk/ingenic/pm.h b/drivers/clk/ingenic/pm.h index fa7540407b6b..0dcb57dc64cb 100644 --- a/drivers/clk/ingenic/pm.h +++ b/drivers/clk/ingenic/pm.h @@ -7,6 +7,6 @@ struct ingenic_cgu; -void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu); +void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu); #endif /* DRIVERS_CLK_INGENIC_PM_H */ diff --git a/drivers/clk/ingenic/tcu.c b/drivers/clk/ingenic/tcu.c index 7d04ef40b7cf..bc6a51da2072 100644 --- a/drivers/clk/ingenic/tcu.c +++ b/drivers/clk/ingenic/tcu.c @@ -455,7 +455,7 @@ err_free_tcu: return ret; } -static int __maybe_unused tcu_pm_suspend(void) 
+static int __maybe_unused tcu_pm_suspend(void *data) { struct ingenic_tcu *tcu = ingenic_tcu; @@ -465,7 +465,7 @@ static int __maybe_unused tcu_pm_suspend(void) return 0; } -static void __maybe_unused tcu_pm_resume(void) +static void __maybe_unused tcu_pm_resume(void *data) { struct ingenic_tcu *tcu = ingenic_tcu; @@ -473,11 +473,15 @@ static void __maybe_unused tcu_pm_resume(void) clk_enable(tcu->clk); } -static struct syscore_ops __maybe_unused tcu_pm_ops = { +static const struct syscore_ops __maybe_unused tcu_pm_ops = { .suspend = tcu_pm_suspend, .resume = tcu_pm_resume, }; +static struct syscore __maybe_unused tcu_pm = { + .ops = &tcu_pm_ops, +}; + static void __init ingenic_tcu_init(struct device_node *np) { int ret = ingenic_tcu_probe(np); @@ -486,7 +490,7 @@ static void __init ingenic_tcu_init(struct device_node *np) pr_crit("Failed to initialize TCU clocks: %d\n", ret); if (IS_ENABLED(CONFIG_PM_SLEEP)) - register_syscore_ops(&tcu_pm_ops); + register_syscore(&tcu_pm); } CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-tcu", ingenic_tcu_init); diff --git a/drivers/clk/ingenic/x1000-cgu.c b/drivers/clk/ingenic/x1000-cgu.c index d80886caf393..d89bdfb7c219 100644 --- a/drivers/clk/ingenic/x1000-cgu.c +++ b/drivers/clk/ingenic/x1000-cgu.c @@ -556,7 +556,7 @@ static void __init x1000_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/ingenic/x1830-cgu.c b/drivers/clk/ingenic/x1830-cgu.c index 0fd46e50a513..acf856e5009e 100644 --- a/drivers/clk/ingenic/x1830-cgu.c +++ b/drivers/clk/ingenic/x1830-cgu.c @@ -463,7 +463,7 @@ static void __init x1830_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/mvebu/common.c b/drivers/clk/mvebu/common.c index 785dbede4835..5adbbd91a6db 100644 --- a/drivers/clk/mvebu/common.c +++ b/drivers/clk/mvebu/common.c @@ -215,22 +215,26 @@ static struct clk *clk_gating_get_src( return ERR_PTR(-ENODEV); } -static int mvebu_clk_gating_suspend(void) +static int mvebu_clk_gating_suspend(void *data) { ctrl->saved_reg = readl(ctrl->base); return 0; } -static void mvebu_clk_gating_resume(void) +static void mvebu_clk_gating_resume(void *data) { writel(ctrl->saved_reg, ctrl->base); } -static struct syscore_ops clk_gate_syscore_ops = { +static const struct syscore_ops clk_gate_syscore_ops = { .suspend = mvebu_clk_gating_suspend, .resume = mvebu_clk_gating_resume, }; +static struct syscore clk_gate_syscore = { + .ops = &clk_gate_syscore_ops, +}; + void __init mvebu_clk_gating_setup(struct device_node *np, const struct clk_gating_soc_desc *desc) { @@ -284,7 +288,7 @@ void __init mvebu_clk_gating_setup(struct device_node *np, of_clk_add_provider(np, clk_gating_get_src, ctrl); - register_syscore_ops(&clk_gate_syscore_ops); + register_syscore(&clk_gate_syscore); return; gates_out: diff --git a/drivers/clk/rockchip/clk-rk3288.c b/drivers/clk/rockchip/clk-rk3288.c index 0a1e017df7c6..9cf3e1e43b78 100644 --- a/drivers/clk/rockchip/clk-rk3288.c +++ b/drivers/clk/rockchip/clk-rk3288.c @@ -871,7 +871,7 @@ static const int rk3288_saved_cru_reg_ids[] = { static u32 rk3288_saved_cru_regs[ARRAY_SIZE(rk3288_saved_cru_reg_ids)]; -static int rk3288_clk_suspend(void) +static int rk3288_clk_suspend(void *data) { int i, reg_id; @@ -906,7 +906,7 
@@ static int rk3288_clk_suspend(void) return 0; } -static void rk3288_clk_resume(void) +static void rk3288_clk_resume(void *data) { int i, reg_id; @@ -923,11 +923,15 @@ static void rk3288_clk_shutdown(void) writel_relaxed(0xf3030000, rk3288_cru_base + RK3288_MODE_CON); } -static struct syscore_ops rk3288_clk_syscore_ops = { +static const struct syscore_ops rk3288_clk_syscore_ops = { .suspend = rk3288_clk_suspend, .resume = rk3288_clk_resume, }; +static struct syscore rk3288_clk_syscore = { + .ops = &rk3288_clk_syscore_ops, +}; + static void __init rk3288_common_init(struct device_node *np, enum rk3288_variant soc) { @@ -976,7 +980,7 @@ static void __init rk3288_common_init(struct device_node *np, rockchip_register_restart_notifier(ctx, RK3288_GLB_SRST_FST, rk3288_clk_shutdown); - register_syscore_ops(&rk3288_clk_syscore_ops); + register_syscore(&rk3288_clk_syscore); rockchip_clk_of_add_provider(np, ctx); } diff --git a/drivers/clk/samsung/clk-s5pv210-audss.c b/drivers/clk/samsung/clk-s5pv210-audss.c index b1fd8fac3a4c..c9fcb23de183 100644 --- a/drivers/clk/samsung/clk-s5pv210-audss.c +++ b/drivers/clk/samsung/clk-s5pv210-audss.c @@ -36,7 +36,7 @@ static unsigned long reg_save[][2] = { {ASS_CLK_GATE, 0}, }; -static int s5pv210_audss_clk_suspend(void) +static int s5pv210_audss_clk_suspend(void *data) { int i; @@ -46,7 +46,7 @@ static int s5pv210_audss_clk_suspend(void) return 0; } -static void s5pv210_audss_clk_resume(void) +static void s5pv210_audss_clk_resume(void *data) { int i; @@ -54,10 +54,14 @@ static void s5pv210_audss_clk_resume(void) writel(reg_save[i][1], reg_base + reg_save[i][0]); } -static struct syscore_ops s5pv210_audss_clk_syscore_ops = { +static const struct syscore_ops s5pv210_audss_clk_syscore_ops = { .suspend = s5pv210_audss_clk_suspend, .resume = s5pv210_audss_clk_resume, }; + +static struct syscore s5pv210_audss_clk_syscore = { + .ops = &s5pv210_audss_clk_syscore_ops, +}; #endif /* CONFIG_PM_SLEEP */ /* register s5pv210_audss clocks */ @@ -175,7 +179,7 @@ static int s5pv210_audss_clk_probe(struct platform_device *pdev) } #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&s5pv210_audss_clk_syscore_ops); + register_syscore(&s5pv210_audss_clk_syscore); #endif return 0; diff --git a/drivers/clk/samsung/clk.c b/drivers/clk/samsung/clk.c index dbc9925ca8f4..c149ca6c2217 100644 --- a/drivers/clk/samsung/clk.c +++ b/drivers/clk/samsung/clk.c @@ -271,7 +271,7 @@ void __init samsung_clk_of_register_fixed_ext(struct samsung_clk_provider *ctx, } #ifdef CONFIG_PM_SLEEP -static int samsung_clk_suspend(void) +static int samsung_clk_suspend(void *data) { struct samsung_clock_reg_cache *reg_cache; @@ -284,7 +284,7 @@ static int samsung_clk_suspend(void) return 0; } -static void samsung_clk_resume(void) +static void samsung_clk_resume(void *data) { struct samsung_clock_reg_cache *reg_cache; @@ -293,11 +293,15 @@ static void samsung_clk_resume(void) reg_cache->rd_num); } -static struct syscore_ops samsung_clk_syscore_ops = { +static const struct syscore_ops samsung_clk_syscore_ops = { .suspend = samsung_clk_suspend, .resume = samsung_clk_resume, }; +static struct syscore samsung_clk_syscore = { + .ops = &samsung_clk_syscore_ops, +}; + void samsung_clk_extended_sleep_init(void __iomem *reg_base, const unsigned long *rdump, unsigned long nr_rdump, @@ -316,7 +320,7 @@ void samsung_clk_extended_sleep_init(void __iomem *reg_base, panic("could not allocate register dump storage.\n"); if (list_empty(&clock_reg_cache_list)) - register_syscore_ops(&samsung_clk_syscore_ops); + 
register_syscore(&samsung_clk_syscore); reg_cache->reg_base = reg_base; reg_cache->rd_num = nr_rdump; diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 412902f573b5..504d0ea997a5 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -3444,7 +3444,7 @@ static void tegra210_disable_cpu_clock(u32 cpu) static u32 spare_reg_ctx, misc_clk_enb_ctx, clk_msk_arm_ctx; static u32 cpu_softrst_ctx[3]; -static int tegra210_clk_suspend(void) +static int tegra210_clk_suspend(void *data) { unsigned int i; @@ -3465,7 +3465,7 @@ static int tegra210_clk_suspend(void) return 0; } -static void tegra210_clk_resume(void) +static void tegra210_clk_resume(void *data) { unsigned int i; @@ -3523,13 +3523,17 @@ static void tegra210_cpu_clock_resume(void) } #endif -static struct syscore_ops tegra_clk_syscore_ops = { +static const struct syscore_ops tegra_clk_syscore_ops = { #ifdef CONFIG_PM_SLEEP .suspend = tegra210_clk_suspend, .resume = tegra210_clk_resume, #endif }; +static struct syscore tegra_clk_syscore = { + .ops = &tegra_clk_syscore_ops, +}; + static struct tegra_cpu_car_ops tegra210_cpu_car_ops = { .wait_for_reset = tegra210_wait_cpu_in_reset, .disable_clock = tegra210_disable_cpu_clock, @@ -3813,6 +3817,6 @@ static void __init tegra210_clock_init(struct device_node *np) tegra_cpu_car_ops = &tegra210_cpu_car_ops; - register_syscore_ops(&tegra_clk_syscore_ops); + register_syscore(&tegra_clk_syscore); } CLK_OF_DECLARE(tegra210, "nvidia,tegra210-car", tegra210_clock_init); diff --git a/drivers/clocksource/timer-armada-370-xp.c b/drivers/clocksource/timer-armada-370-xp.c index 54284c1c0651..f2b4cc40db93 100644 --- a/drivers/clocksource/timer-armada-370-xp.c +++ b/drivers/clocksource/timer-armada-370-xp.c @@ -207,14 +207,14 @@ static int armada_370_xp_timer_dying_cpu(unsigned int cpu) static u32 timer0_ctrl_reg, timer0_local_ctrl_reg; -static int armada_370_xp_timer_suspend(void) +static int armada_370_xp_timer_suspend(void *data) { timer0_ctrl_reg = readl(timer_base + TIMER_CTRL_OFF); timer0_local_ctrl_reg = readl(local_base + TIMER_CTRL_OFF); return 0; } -static void armada_370_xp_timer_resume(void) +static void armada_370_xp_timer_resume(void *data) { writel(0xffffffff, timer_base + TIMER0_VAL_OFF); writel(0xffffffff, timer_base + TIMER0_RELOAD_OFF); @@ -222,11 +222,15 @@ static void armada_370_xp_timer_resume(void) writel(timer0_local_ctrl_reg, local_base + TIMER_CTRL_OFF); } -static struct syscore_ops armada_370_xp_timer_syscore_ops = { +static const struct syscore_ops armada_370_xp_timer_syscore_ops = { .suspend = armada_370_xp_timer_suspend, .resume = armada_370_xp_timer_resume, }; +static struct syscore armada_370_xp_timer_syscore = { + .ops = &armada_370_xp_timer_syscore_ops, +}; + static unsigned long armada_370_delay_timer_read(void) { return ~readl(timer_base + TIMER0_VAL_OFF); @@ -324,7 +328,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np) return res; } - register_syscore_ops(&armada_370_xp_timer_syscore_ops); + register_syscore(&armada_370_xp_timer_syscore); return 0; } diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index b19bc60cc627..3372e1f90561 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -177,26 +177,30 @@ static void psci_idle_syscore_switch(bool suspend) } } -static int psci_idle_syscore_suspend(void) +static int psci_idle_syscore_suspend(void *data) { psci_idle_syscore_switch(true); return 0; } -static void 
psci_idle_syscore_resume(void) +static void psci_idle_syscore_resume(void *data) { psci_idle_syscore_switch(false); } -static struct syscore_ops psci_idle_syscore_ops = { +static const struct syscore_ops psci_idle_syscore_ops = { .suspend = psci_idle_syscore_suspend, .resume = psci_idle_syscore_resume, }; +static struct syscore psci_idle_syscore = { + .ops = &psci_idle_syscore_ops, +}; + static void psci_idle_init_syscore(void) { if (psci_cpuidle_use_syscore) - register_syscore_ops(&psci_idle_syscore_ops); + register_syscore(&psci_idle_syscore); } static void psci_idle_init_cpuhp(void) diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 52060b3ec745..d7666fe9dbf8 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -667,7 +667,7 @@ static const struct dev_pm_ops mxc_gpio_dev_pm_ops = { RUNTIME_PM_OPS(mxc_gpio_runtime_suspend, mxc_gpio_runtime_resume, NULL) }; -static int mxc_gpio_syscore_suspend(void) +static int mxc_gpio_syscore_suspend(void *data) { struct mxc_gpio_port *port; int ret; @@ -684,7 +684,7 @@ static int mxc_gpio_syscore_suspend(void) return 0; } -static void mxc_gpio_syscore_resume(void) +static void mxc_gpio_syscore_resume(void *data) { struct mxc_gpio_port *port; int ret; @@ -701,11 +701,15 @@ static void mxc_gpio_syscore_resume(void) } } -static struct syscore_ops mxc_gpio_syscore_ops = { +static const struct syscore_ops mxc_gpio_syscore_ops = { .suspend = mxc_gpio_syscore_suspend, .resume = mxc_gpio_syscore_resume, }; +static struct syscore mxc_gpio_syscore = { + .ops = &mxc_gpio_syscore_ops, +}; + static struct platform_driver mxc_gpio_driver = { .driver = { .name = "gpio-mxc", @@ -718,7 +722,7 @@ static struct platform_driver mxc_gpio_driver = { static int __init gpio_mxc_init(void) { - register_syscore_ops(&mxc_gpio_syscore_ops); + register_syscore(&mxc_gpio_syscore); return platform_driver_register(&mxc_gpio_driver); } diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index fa22f3faa163..664cf1eef494 100644 --- a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -747,7 +747,7 @@ static int __init pxa_gpio_dt_init(void) device_initcall(pxa_gpio_dt_init); #ifdef CONFIG_PM -static int pxa_gpio_suspend(void) +static int pxa_gpio_suspend(void *data) { struct pxa_gpio_chip *pchip = pxa_gpio_chip; struct pxa_gpio_bank *c; @@ -768,7 +768,7 @@ static int pxa_gpio_suspend(void) return 0; } -static void pxa_gpio_resume(void) +static void pxa_gpio_resume(void *data) { struct pxa_gpio_chip *pchip = pxa_gpio_chip; struct pxa_gpio_bank *c; @@ -792,14 +792,18 @@ static void pxa_gpio_resume(void) #define pxa_gpio_resume NULL #endif -static struct syscore_ops pxa_gpio_syscore_ops = { +static const struct syscore_ops pxa_gpio_syscore_ops = { .suspend = pxa_gpio_suspend, .resume = pxa_gpio_resume, }; +static struct syscore pxa_gpio_syscore = { + .ops = &pxa_gpio_syscore_ops, +}; + static int __init pxa_gpio_sysinit(void) { - register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore(&pxa_gpio_syscore); return 0; } postcore_initcall(pxa_gpio_sysinit); diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c index 7f6a62f5d1ee..1938ffa2f4f3 100644 --- a/drivers/gpio/gpio-sa1100.c +++ b/drivers/gpio/gpio-sa1100.c @@ -256,7 +256,7 @@ static void sa1100_gpio_handler(struct irq_desc *desc) } while (mask); } -static int sa1100_gpio_suspend(void) +static int sa1100_gpio_suspend(void *data) { struct sa1100_gpio_chip *sgc = &sa1100_gpio_chip; @@ -275,19 +275,23 @@ static int sa1100_gpio_suspend(void) return 0; } -static void 
sa1100_gpio_resume(void) +static void sa1100_gpio_resume(void *data) { sa1100_update_edge_regs(&sa1100_gpio_chip); } -static struct syscore_ops sa1100_gpio_syscore_ops = { +static const struct syscore_ops sa1100_gpio_syscore_ops = { .suspend = sa1100_gpio_suspend, .resume = sa1100_gpio_resume, }; +static struct syscore sa1100_gpio_syscore = { + .ops = &sa1100_gpio_syscore_ops, +}; + static int __init sa1100_gpio_init_devicefs(void) { - register_syscore_ops(&sa1100_gpio_syscore_ops); + register_syscore(&sa1100_gpio_syscore); return 0; } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..67734dc73e16 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2801,7 +2801,7 @@ static void hv_crash_handler(struct pt_regs *regs) hv_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2828,7 +2828,7 @@ static int hv_synic_suspend(void) return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { hv_synic_enable_regs(0); @@ -2840,11 +2840,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2887,7 +2891,7 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; @@ -2901,7 +2905,7 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f2991c11867c..b763f4c9c5a7 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3024,7 +3024,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static void amd_iommu_resume(void) +static void amd_iommu_resume(void *data) { struct amd_iommu *iommu; @@ -3038,7 +3038,7 @@ static void amd_iommu_resume(void) amd_iommu_enable_interrupts(); } -static int amd_iommu_suspend(void) +static int amd_iommu_suspend(void *data) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -3046,11 +3046,15 @@ static int amd_iommu_suspend(void) return 0; } -static struct syscore_ops amd_iommu_syscore_ops = { +static const struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; +static struct syscore amd_iommu_syscore = { + .ops = &amd_iommu_syscore_ops, +}; + static void __init free_iommu_resources(void) { free_iommu_all(); @@ -3395,7 +3399,7 @@ static int __init state_next(void) init_state = IOMMU_ENABLED; break; case IOMMU_ENABLED: - register_syscore_ops(&amd_iommu_syscore_ops); + register_syscore(&amd_iommu_syscore); iommu_snp_enable(); ret = amd_iommu_init_pci(); init_state = ret ? 
IOMMU_INIT_ERROR : IOMMU_PCI_INIT; @@ -3498,12 +3502,12 @@ int __init amd_iommu_enable(void) void amd_iommu_disable(void) { - amd_iommu_suspend(); + amd_iommu_suspend(NULL); } int amd_iommu_reenable(int mode) { - amd_iommu_resume(); + amd_iommu_resume(NULL); return 0; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..fdaf7f64dd33 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2303,7 +2303,7 @@ static void iommu_flush_all(void) } } -static int iommu_suspend(void) +static int iommu_suspend(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2330,7 +2330,7 @@ static int iommu_suspend(void) return 0; } -static void iommu_resume(void) +static void iommu_resume(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2361,14 +2361,18 @@ static void iommu_resume(void) } } -static struct syscore_ops iommu_syscore_ops = { +static const struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + static void __init init_iommu_pm_ops(void) { - register_syscore_ops(&iommu_syscore_ops); + register_syscore(&iommu_syscore); } #else diff --git a/drivers/irqchip/exynos-combiner.c b/drivers/irqchip/exynos-combiner.c index e7dfcf0cda43..495848442b35 100644 --- a/drivers/irqchip/exynos-combiner.c +++ b/drivers/irqchip/exynos-combiner.c @@ -200,12 +200,13 @@ static void __init combiner_init(void __iomem *combiner_base, /** * combiner_suspend - save interrupt combiner state before suspend + * @data: syscore context * * Save the interrupt enable set register for all combiner groups since * the state is lost when the system enters into a sleep state. * */ -static int combiner_suspend(void) +static int combiner_suspend(void *data) { int i; @@ -218,12 +219,13 @@ static int combiner_suspend(void) /** * combiner_resume - restore interrupt combiner state after resume + * @data: syscore context * * Restore the interrupt enable set register for all combiner groups since * the state is lost when the system enters into a sleep state on suspend. 
* */ -static void combiner_resume(void) +static void combiner_resume(void *data) { int i; @@ -240,11 +242,15 @@ static void combiner_resume(void) #define combiner_resume NULL #endif -static struct syscore_ops combiner_syscore_ops = { +static const struct syscore_ops combiner_syscore_ops = { .suspend = combiner_suspend, .resume = combiner_resume, }; +static struct syscore combiner_syscore = { + .ops = &combiner_syscore_ops, +}; + static int __init combiner_of_init(struct device_node *np, struct device_node *parent) { @@ -264,7 +270,7 @@ static int __init combiner_of_init(struct device_node *np, combiner_init(combiner_base, np); - register_syscore_ops(&combiner_syscore_ops); + register_syscore(&combiner_syscore); return 0; } diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index a44c49e985b7..a4d03a2d1569 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -726,7 +726,7 @@ static void __exception_irq_entry mpic_handle_irq(struct pt_regs *regs) } while (1); } -static int mpic_suspend(void) +static int mpic_suspend(void *data) { struct mpic *mpic = mpic_data; @@ -735,7 +735,7 @@ static int mpic_suspend(void) return 0; } -static void mpic_resume(void) +static void mpic_resume(void *data) { struct mpic *mpic = mpic_data; bool src0, src1; @@ -788,11 +788,15 @@ static void mpic_resume(void) mpic_ipi_resume(mpic); } -static struct syscore_ops mpic_syscore_ops = { +static const struct syscore_ops mpic_syscore_ops = { .suspend = mpic_suspend, .resume = mpic_resume, }; +static struct syscore mpic_syscore = { + .ops = &mpic_syscore_ops, +}; + static int __init mpic_map_region(struct device_node *np, int index, void __iomem **base, phys_addr_t *phys_base) { @@ -905,7 +909,7 @@ static int __init mpic_of_init(struct device_node *node, struct device_node *par mpic_handle_cascade_irq, mpic); } - register_syscore_ops(&mpic_syscore_ops); + register_syscore(&mpic_syscore); return 0; } diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index 04fac0cc857f..cd9a56459f99 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -292,7 +292,7 @@ static int __init bcm7038_l1_init_one(struct device_node *dn, static LIST_HEAD(bcm7038_l1_intcs_list); static DEFINE_RAW_SPINLOCK(bcm7038_l1_intcs_lock); -static int bcm7038_l1_suspend(void) +static int bcm7038_l1_suspend(void *data) { struct bcm7038_l1_chip *intc; int boot_cpu, word; @@ -318,7 +318,7 @@ static int bcm7038_l1_suspend(void) return 0; } -static void bcm7038_l1_resume(void) +static void bcm7038_l1_resume(void *data) { struct bcm7038_l1_chip *intc; int boot_cpu, word; @@ -339,11 +339,15 @@ static void bcm7038_l1_resume(void) } } -static struct syscore_ops bcm7038_l1_syscore_ops = { +static const struct syscore_ops bcm7038_l1_syscore_ops = { .suspend = bcm7038_l1_suspend, .resume = bcm7038_l1_resume, }; +static struct syscore bcm7038_l1_syscore = { + .ops = &bcm7038_l1_syscore_ops, +}; + static int bcm7038_l1_set_wake(struct irq_data *d, unsigned int on) { struct bcm7038_l1_chip *intc = irq_data_get_irq_chip_data(d); @@ -431,7 +435,7 @@ static int __init bcm7038_l1_of_init(struct device_node *dn, raw_spin_unlock(&bcm7038_l1_intcs_lock); if (list_is_singular(&bcm7038_l1_intcs_list)) - register_syscore_ops(&bcm7038_l1_syscore_ops); + register_syscore(&bcm7038_l1_syscore); #endif pr_info("registered BCM7038 L1 intc (%pOF, IRQs: %d)\n", diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 
467cb78435a9..ada585bfa451 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4992,7 +4992,7 @@ static void its_enable_quirks(struct its_node *its) its_quirks, its); } -static int its_save_disable(void) +static int its_save_disable(void *data) { struct its_node *its; int err = 0; @@ -5028,7 +5028,7 @@ err: return err; } -static void its_restore_enable(void) +static void its_restore_enable(void *data) { struct its_node *its; int ret; @@ -5088,11 +5088,15 @@ static void its_restore_enable(void) raw_spin_unlock(&its_lock); } -static struct syscore_ops its_syscore_ops = { +static const struct syscore_ops its_syscore_ops = { .suspend = its_save_disable, .resume = its_restore_enable, }; +static struct syscore its_syscore = { + .ops = &its_syscore_ops, +}; + static void __init __iomem *its_map_one(struct resource *res, int *err) { void __iomem *its_base; @@ -5864,7 +5868,7 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, } } - register_syscore_ops(&its_syscore_ops); + register_syscore(&its_syscore); return 0; } diff --git a/drivers/irqchip/irq-i8259.c b/drivers/irqchip/irq-i8259.c index 91b2f587119c..cca77f9948a3 100644 --- a/drivers/irqchip/irq-i8259.c +++ b/drivers/irqchip/irq-i8259.c @@ -202,13 +202,13 @@ spurious_8259A_irq: } } -static void i8259A_resume(void) +static void i8259A_resume(void *data) { if (i8259A_auto_eoi >= 0) init_8259A(i8259A_auto_eoi); } -static void i8259A_shutdown(void) +static void i8259A_shutdown(void *data) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -220,11 +220,15 @@ static void i8259A_shutdown(void) } } -static struct syscore_ops i8259_syscore_ops = { +static const struct syscore_ops i8259_syscore_ops = { .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; +static struct syscore i8259_syscore = { + .ops = &i8259_syscore_ops, +}; + static void init_8259A(int auto_eoi) { unsigned long flags; @@ -320,7 +324,7 @@ struct irq_domain * __init __init_i8259_irqs(struct device_node *node) if (request_irq(irq, no_action, IRQF_NO_THREAD, "cascade", NULL)) pr_err("Failed to register cascade interrupt\n"); - register_syscore_ops(&i8259_syscore_ops); + register_syscore(&i8259_syscore); return domain; } diff --git a/drivers/irqchip/irq-imx-gpcv2.c b/drivers/irqchip/irq-imx-gpcv2.c index b91f5c14b405..04f7ba0657be 100644 --- a/drivers/irqchip/irq-imx-gpcv2.c +++ b/drivers/irqchip/irq-imx-gpcv2.c @@ -33,7 +33,7 @@ static void __iomem *gpcv2_idx_to_reg(struct gpcv2_irqchip_data *cd, int i) return cd->gpc_base + cd->cpu2wakeup + i * 4; } -static int gpcv2_wakeup_source_save(void) +static int gpcv2_wakeup_source_save(void *data) { struct gpcv2_irqchip_data *cd; void __iomem *reg; @@ -52,7 +52,7 @@ static int gpcv2_wakeup_source_save(void) return 0; } -static void gpcv2_wakeup_source_restore(void) +static void gpcv2_wakeup_source_restore(void *data) { struct gpcv2_irqchip_data *cd; int i; @@ -65,9 +65,13 @@ static void gpcv2_wakeup_source_restore(void) writel_relaxed(cd->saved_irq_mask[i], gpcv2_idx_to_reg(cd, i)); } -static struct syscore_ops imx_gpcv2_syscore_ops = { - .suspend = gpcv2_wakeup_source_save, - .resume = gpcv2_wakeup_source_restore, +static const struct syscore_ops gpcv2_syscore_ops = { + .suspend = gpcv2_wakeup_source_save, + .resume = gpcv2_wakeup_source_restore, +}; + +static struct syscore gpcv2_syscore = { + .ops = &gpcv2_syscore_ops, }; static int imx_gpcv2_irq_set_wake(struct irq_data *d, unsigned int on) @@ -276,7 +280,7 @@ static int __init 
imx_gpcv2_irqchip_init(struct device_node *node, writel_relaxed(~0x1, cd->gpc_base + cd->cpu2wakeup); imx_gpcv2_instance = cd; - register_syscore_ops(&imx_gpcv2_syscore_ops); + register_syscore(&gpcv2_syscore); /* * Clear the OF_POPULATED flag set in of_irq_init so that diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index 39e5a72ccd3c..ad2105685b48 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -407,21 +407,25 @@ static struct irq_domain *acpi_get_vec_parent(int node, struct acpi_vector_group return NULL; } -static int eiointc_suspend(void) +static int eiointc_suspend(void *data) { return 0; } -static void eiointc_resume(void) +static void eiointc_resume(void *data) { eiointc_router_init(0); } -static struct syscore_ops eiointc_syscore_ops = { +static const struct syscore_ops eiointc_syscore_ops = { .suspend = eiointc_suspend, .resume = eiointc_resume, }; +static struct syscore eiointc_syscore = { + .ops = &eiointc_syscore_ops, +}; + static int __init pch_pic_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { @@ -540,7 +544,7 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq, eiointc_router_init(0); if (nr_pics == 1) { - register_syscore_ops(&eiointc_syscore_ops); + register_syscore(&eiointc_syscore); cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_EIOINTC_STARTING, "irqchip/loongarch/eiointc:starting", eiointc_router_init, NULL); diff --git a/drivers/irqchip/irq-loongson-htpic.c b/drivers/irqchip/irq-loongson-htpic.c index f4abdf156de7..1c691c4be989 100644 --- a/drivers/irqchip/irq-loongson-htpic.c +++ b/drivers/irqchip/irq-loongson-htpic.c @@ -71,15 +71,19 @@ static void htpic_reg_init(void) writel(0xffff, htpic->base + HTINT_EN_OFF); } -static void htpic_resume(void) +static void htpic_resume(void *data) { htpic_reg_init(); } -struct syscore_ops htpic_syscore_ops = { +static const struct syscore_ops htpic_syscore_ops = { .resume = htpic_resume, }; +static struct syscore htpic_syscore = { + .ops = &htpic_syscore_ops, +}; + static int __init htpic_of_init(struct device_node *node, struct device_node *parent) { unsigned int parent_irq[4]; @@ -130,7 +134,7 @@ static int __init htpic_of_init(struct device_node *node, struct device_node *pa htpic_irq_dispatch, htpic); } - register_syscore_ops(&htpic_syscore_ops); + register_syscore(&htpic_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-htvec.c b/drivers/irqchip/irq-loongson-htvec.c index d8558eb35044..d2be8e954e92 100644 --- a/drivers/irqchip/irq-loongson-htvec.c +++ b/drivers/irqchip/irq-loongson-htvec.c @@ -159,7 +159,7 @@ static void htvec_reset(struct htvec *priv) } } -static int htvec_suspend(void) +static int htvec_suspend(void *data) { int i; @@ -169,7 +169,7 @@ static int htvec_suspend(void) return 0; } -static void htvec_resume(void) +static void htvec_resume(void *data) { int i; @@ -177,11 +177,15 @@ static void htvec_resume(void) writel(htvec_priv->saved_vec_en[i], htvec_priv->base + HTVEC_EN_OFF + 4 * i); } -static struct syscore_ops htvec_syscore_ops = { +static const struct syscore_ops htvec_syscore_ops = { .suspend = htvec_suspend, .resume = htvec_resume, }; +static struct syscore htvec_syscore = { + .ops = &htvec_syscore_ops, +}; + static int htvec_init(phys_addr_t addr, unsigned long size, int num_parents, int parent_irq[], struct fwnode_handle *domain_handle) { @@ -214,7 +218,7 @@ static int htvec_init(phys_addr_t addr, unsigned long size, htvec_priv = priv; - 
register_syscore_ops(&htvec_syscore_ops); + register_syscore(&htvec_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c index 912bf50a5c7c..3a125f3e4287 100644 --- a/drivers/irqchip/irq-loongson-pch-lpc.c +++ b/drivers/irqchip/irq-loongson-pch-lpc.c @@ -151,7 +151,7 @@ static int pch_lpc_disabled(struct pch_lpc *priv) (readl(priv->base + LPC_INT_STS) == 0xffffffff); } -static int pch_lpc_suspend(void) +static int pch_lpc_suspend(void *data) { pch_lpc_priv->saved_reg_ctl = readl(pch_lpc_priv->base + LPC_INT_CTL); pch_lpc_priv->saved_reg_ena = readl(pch_lpc_priv->base + LPC_INT_ENA); @@ -159,18 +159,22 @@ static int pch_lpc_suspend(void) return 0; } -static void pch_lpc_resume(void) +static void pch_lpc_resume(void *data) { writel(pch_lpc_priv->saved_reg_ctl, pch_lpc_priv->base + LPC_INT_CTL); writel(pch_lpc_priv->saved_reg_ena, pch_lpc_priv->base + LPC_INT_ENA); writel(pch_lpc_priv->saved_reg_pol, pch_lpc_priv->base + LPC_INT_POL); } -static struct syscore_ops pch_lpc_syscore_ops = { +static const struct syscore_ops pch_lpc_syscore_ops = { .suspend = pch_lpc_suspend, .resume = pch_lpc_resume, }; +static struct syscore pch_lpc_syscore = { + .ops = &pch_lpc_syscore_ops, +}; + int __init pch_lpc_acpi_init(struct irq_domain *parent, struct acpi_madt_lpc_pic *acpi_pchlpc) { @@ -222,7 +226,7 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent, pch_lpc_priv = priv; pch_lpc_handle = irq_handle; - register_syscore_ops(&pch_lpc_syscore_ops); + register_syscore(&pch_lpc_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c index 62e6bf3a0611..c6b369a974a7 100644 --- a/drivers/irqchip/irq-loongson-pch-pic.c +++ b/drivers/irqchip/irq-loongson-pch-pic.c @@ -278,7 +278,7 @@ static void pch_pic_reset(struct pch_pic *priv) } } -static int pch_pic_suspend(void) +static int pch_pic_suspend(void *data) { int i, j; @@ -296,7 +296,7 @@ static int pch_pic_suspend(void) return 0; } -static void pch_pic_resume(void) +static void pch_pic_resume(void *data) { int i, j; @@ -313,11 +313,15 @@ static void pch_pic_resume(void) } } -static struct syscore_ops pch_pic_syscore_ops = { +static const struct syscore_ops pch_pic_syscore_ops = { .suspend = pch_pic_suspend, .resume = pch_pic_resume, }; +static struct syscore pch_pic_syscore = { + .ops = &pch_pic_syscore_ops, +}; + static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base, struct irq_domain *parent_domain, struct fwnode_handle *domain_handle, u32 gsi_base) @@ -356,7 +360,7 @@ static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base, pch_pic_priv[nr_pics++] = priv; if (nr_pics == 1) - register_syscore_ops(&pch_pic_syscore_ops); + register_syscore(&pch_pic_syscore); return 0; diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c index 516a3a0e359c..be83e6e422c3 100644 --- a/drivers/irqchip/irq-mchp-eic.c +++ b/drivers/irqchip/irq-mchp-eic.c @@ -109,7 +109,7 @@ static int mchp_eic_irq_set_wake(struct irq_data *d, unsigned int on) return 0; } -static int mchp_eic_irq_suspend(void) +static int mchp_eic_irq_suspend(void *data) { unsigned int hwirq; @@ -123,7 +123,7 @@ static int mchp_eic_irq_suspend(void) return 0; } -static void mchp_eic_irq_resume(void) +static void mchp_eic_irq_resume(void *data) { unsigned int hwirq; @@ -135,11 +135,15 @@ static void mchp_eic_irq_resume(void) MCHP_EIC_SCFG(hwirq)); } -static struct syscore_ops mchp_eic_syscore_ops = { +static const struct syscore_ops 
mchp_eic_syscore_ops = { .suspend = mchp_eic_irq_suspend, .resume = mchp_eic_irq_resume, }; +static struct syscore mchp_eic_syscore = { + .ops = &mchp_eic_syscore_ops, +}; + static struct irq_chip mchp_eic_chip = { .name = "eic", .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SET_TYPE_MASKED, @@ -257,7 +261,7 @@ static int mchp_eic_init(struct device_node *node, struct device_node *parent) goto clk_unprepare; } - register_syscore_ops(&mchp_eic_syscore_ops); + register_syscore(&mchp_eic_syscore); pr_info("%pOF: EIC registered, nr_irqs %u\n", node, MCHP_EIC_NIRQ); diff --git a/drivers/irqchip/irq-mst-intc.c b/drivers/irqchip/irq-mst-intc.c index 9643cc3a77d7..7f760f555a76 100644 --- a/drivers/irqchip/irq-mst-intc.c +++ b/drivers/irqchip/irq-mst-intc.c @@ -143,7 +143,7 @@ static void mst_intc_polarity_restore(struct mst_intc_chip_data *cd) writew_relaxed(cd->saved_polarity_conf[i], addr + i * 4); } -static void mst_irq_resume(void) +static void mst_irq_resume(void *data) { struct mst_intc_chip_data *cd; @@ -151,7 +151,7 @@ static void mst_irq_resume(void) mst_intc_polarity_restore(cd); } -static int mst_irq_suspend(void) +static int mst_irq_suspend(void *data) { struct mst_intc_chip_data *cd; @@ -160,14 +160,18 @@ static int mst_irq_suspend(void) return 0; } -static struct syscore_ops mst_irq_syscore_ops = { +static const struct syscore_ops mst_irq_syscore_ops = { .suspend = mst_irq_suspend, .resume = mst_irq_resume, }; +static struct syscore mst_irq_syscore = { + .ops = &mst_irq_syscore_ops, +}; + static int __init mst_irq_pm_init(void) { - register_syscore_ops(&mst_irq_syscore_ops); + register_syscore(&mst_irq_syscore); return 0; } late_initcall(mst_irq_pm_init); diff --git a/drivers/irqchip/irq-mtk-cirq.c b/drivers/irqchip/irq-mtk-cirq.c index de481ba340f8..9571f622774e 100644 --- a/drivers/irqchip/irq-mtk-cirq.c +++ b/drivers/irqchip/irq-mtk-cirq.c @@ -199,7 +199,7 @@ static const struct irq_domain_ops cirq_domain_ops = { }; #ifdef CONFIG_PM_SLEEP -static int mtk_cirq_suspend(void) +static int mtk_cirq_suspend(void *data) { void __iomem *reg; u32 value, mask; @@ -257,7 +257,7 @@ static int mtk_cirq_suspend(void) return 0; } -static void mtk_cirq_resume(void) +static void mtk_cirq_resume(void *data) { void __iomem *reg = mtk_cirq_reg(cirq_data, CIRQ_CONTROL); u32 value; @@ -272,14 +272,18 @@ static void mtk_cirq_resume(void) writel_relaxed(value, reg); } -static struct syscore_ops mtk_cirq_syscore_ops = { +static const struct syscore_ops mtk_cirq_syscore_ops = { .suspend = mtk_cirq_suspend, .resume = mtk_cirq_resume, }; +static struct syscore mtk_cirq_syscore = { + .ops = &mtk_cirq_syscore_ops, +}; + static void mtk_cirq_syscore_init(void) { - register_syscore_ops(&mtk_cirq_syscore_ops); + register_syscore(&mtk_cirq_syscore); } #else static inline void mtk_cirq_syscore_init(void) {} diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 2a54adeb4cc7..de88e80eac0c 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -399,7 +399,7 @@ static int rzg2l_irqc_set_type(struct irq_data *d, unsigned int type) return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); } -static int rzg2l_irqc_irq_suspend(void) +static int rzg2l_irqc_irq_suspend(void *data) { struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; void __iomem *base = rzg2l_irqc_data->base; @@ -411,7 +411,7 @@ static int rzg2l_irqc_irq_suspend(void) return 0; } -static void rzg2l_irqc_irq_resume(void) +static void rzg2l_irqc_irq_resume(void *data) { struct 
rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; void __iomem *base = rzg2l_irqc_data->base; @@ -426,11 +426,15 @@ static void rzg2l_irqc_irq_resume(void) writel_relaxed(cache->iitsr, base + IITSR); } -static struct syscore_ops rzg2l_irqc_syscore_ops = { +static const struct syscore_ops rzg2l_irqc_syscore_ops = { .suspend = rzg2l_irqc_irq_suspend, .resume = rzg2l_irqc_irq_resume, }; +static struct syscore rzg2l_irqc_syscore = { + .ops = &rzg2l_irqc_syscore_ops, +}; + static const struct irq_chip rzg2l_irqc_chip = { .name = "rzg2l-irqc", .irq_eoi = rzg2l_irqc_eoi, @@ -581,7 +585,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * return -ENOMEM; } - register_syscore_ops(&rzg2l_irqc_syscore_ops); + register_syscore(&rzg2l_irqc_syscore); /* * Prevent the cleanup function from invoking put_device by assigning diff --git a/drivers/irqchip/irq-sa11x0.c b/drivers/irqchip/irq-sa11x0.c index d8d4dff16276..e5f24c5f3f41 100644 --- a/drivers/irqchip/irq-sa11x0.c +++ b/drivers/irqchip/irq-sa11x0.c @@ -85,7 +85,7 @@ static struct sa1100irq_state { unsigned int iccr; } sa1100irq_state; -static int sa1100irq_suspend(void) +static int sa1100irq_suspend(void *data) { struct sa1100irq_state *st = &sa1100irq_state; @@ -102,7 +102,7 @@ static int sa1100irq_suspend(void) return 0; } -static void sa1100irq_resume(void) +static void sa1100irq_resume(void *data) { struct sa1100irq_state *st = &sa1100irq_state; @@ -114,14 +114,18 @@ static void sa1100irq_resume(void) } } -static struct syscore_ops sa1100irq_syscore_ops = { +static const struct syscore_ops sa1100irq_syscore_ops = { .suspend = sa1100irq_suspend, .resume = sa1100irq_resume, }; +static struct syscore sa1100irq_syscore = { + .ops = &sa1100irq_syscore_ops, +}; + static int __init sa1100irq_init_devicefs(void) { - register_syscore_ops(&sa1100irq_syscore_ops); + register_syscore(&sa1100irq_syscore); return 0; } diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index cbd7697bc148..4f59c0ca1537 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -245,7 +245,7 @@ static int plic_irq_set_type(struct irq_data *d, unsigned int type) return IRQ_SET_MASK_OK; } -static int plic_irq_suspend(void) +static int plic_irq_suspend(void *data) { unsigned int i, cpu; unsigned long flags; @@ -277,7 +277,7 @@ static int plic_irq_suspend(void) return 0; } -static void plic_irq_resume(void) +static void plic_irq_resume(void *data) { unsigned int i, index, cpu; unsigned long flags; @@ -308,11 +308,15 @@ static void plic_irq_resume(void) } } -static struct syscore_ops plic_irq_syscore_ops = { +static const struct syscore_ops plic_irq_syscore_ops = { .suspend = plic_irq_suspend, .resume = plic_irq_resume, }; +static struct syscore plic_irq_syscore = { + .ops = &plic_irq_syscore_ops, +}; + static int plic_irqdomain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { @@ -678,7 +682,7 @@ done: cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, "irqchip/sifive/plic:starting", plic_starting_cpu, plic_dying_cpu); - register_syscore_ops(&plic_irq_syscore_ops); + register_syscore(&plic_irq_syscore); plic_global_setup_done = true; } } diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c index 37d4b29763bc..23251831c06e 100644 --- a/drivers/irqchip/irq-sun6i-r.c +++ b/drivers/irqchip/irq-sun6i-r.c @@ -268,7 +268,7 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = { .free = irq_domain_free_irqs_common, }; -static int 
sun6i_r_intc_suspend(void) +static int sun6i_r_intc_suspend(void *data) { u32 buf[BITS_TO_U32(MAX(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; int i; @@ -284,7 +284,7 @@ static int sun6i_r_intc_suspend(void) return 0; } -static void sun6i_r_intc_resume(void) +static void sun6i_r_intc_resume(void *data) { int i; @@ -294,17 +294,21 @@ static void sun6i_r_intc_resume(void) writel_relaxed(0, base + SUN6I_IRQ_ENABLE(i)); } -static void sun6i_r_intc_shutdown(void) +static void sun6i_r_intc_shutdown(void *data) { - sun6i_r_intc_suspend(); + sun6i_r_intc_suspend(data); } -static struct syscore_ops sun6i_r_intc_syscore_ops = { +static const struct syscore_ops sun6i_r_intc_syscore_ops = { .suspend = sun6i_r_intc_suspend, .resume = sun6i_r_intc_resume, .shutdown = sun6i_r_intc_shutdown, }; +static struct syscore sun6i_r_intc_syscore = { + .ops = &sun6i_r_intc_syscore_ops, +}; + static int __init sun6i_r_intc_init(struct device_node *node, struct device_node *parent, const struct sun6i_r_intc_variant *v) @@ -346,10 +350,10 @@ static int __init sun6i_r_intc_init(struct device_node *node, return -ENOMEM; } - register_syscore_ops(&sun6i_r_intc_syscore_ops); + register_syscore(&sun6i_r_intc_syscore); sun6i_r_intc_ack_nmi(); - sun6i_r_intc_resume(); + sun6i_r_intc_resume(NULL); return 0; } diff --git a/drivers/irqchip/irq-tegra.c b/drivers/irqchip/irq-tegra.c index 66cbb9f77ff3..b6382cf6359a 100644 --- a/drivers/irqchip/irq-tegra.c +++ b/drivers/irqchip/irq-tegra.c @@ -132,7 +132,7 @@ static int tegra_set_wake(struct irq_data *d, unsigned int enable) return 0; } -static int tegra_ictlr_suspend(void) +static int tegra_ictlr_suspend(void *data) { unsigned long flags; unsigned int i; @@ -161,7 +161,7 @@ static int tegra_ictlr_suspend(void) return 0; } -static void tegra_ictlr_resume(void) +static void tegra_ictlr_resume(void *data) { unsigned long flags; unsigned int i; @@ -184,14 +184,18 @@ static void tegra_ictlr_resume(void) local_irq_restore(flags); } -static struct syscore_ops tegra_ictlr_syscore_ops = { +static const struct syscore_ops tegra_ictlr_syscore_ops = { .suspend = tegra_ictlr_suspend, .resume = tegra_ictlr_resume, }; +static struct syscore tegra_ictlr_syscore = { + .ops = &tegra_ictlr_syscore_ops, +}; + static void tegra_ictlr_syscore_init(void) { - register_syscore_ops(&tegra_ictlr_syscore_ops); + register_syscore(&tegra_ictlr_syscore); } #else #define tegra_set_wake NULL diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c index 2bcdf216a000..e38104c5064e 100644 --- a/drivers/irqchip/irq-vic.c +++ b/drivers/irqchip/irq-vic.c @@ -120,7 +120,7 @@ static void resume_one_vic(struct vic_device *vic) writel(~vic->soft_int, base + VIC_INT_SOFT_CLEAR); } -static void vic_resume(void) +static void vic_resume(void *data) { int id; @@ -146,7 +146,7 @@ static void suspend_one_vic(struct vic_device *vic) writel(~vic->resume_irqs, base + VIC_INT_ENABLE_CLEAR); } -static int vic_suspend(void) +static int vic_suspend(void *data) { int id; @@ -156,11 +156,15 @@ static int vic_suspend(void) return 0; } -static struct syscore_ops vic_syscore_ops = { +static const struct syscore_ops vic_syscore_ops = { .suspend = vic_suspend, .resume = vic_resume, }; +static struct syscore vic_syscore = { + .ops = &vic_syscore_ops, +}; + /** * vic_pm_init - initcall to register VIC pm * @@ -171,7 +175,7 @@ static struct syscore_ops vic_syscore_ops = { static int __init vic_pm_init(void) { if (vic_id > 0) - register_syscore_ops(&vic_syscore_ops); + register_syscore(&vic_syscore); return 0; } diff --git 
a/drivers/leds/trigger/ledtrig-cpu.c b/drivers/leds/trigger/ledtrig-cpu.c index 05848a2fecff..679323c2ccda 100644 --- a/drivers/leds/trigger/ledtrig-cpu.c +++ b/drivers/leds/trigger/ledtrig-cpu.c @@ -94,28 +94,32 @@ void ledtrig_cpu(enum cpu_led_event ledevt) } EXPORT_SYMBOL(ledtrig_cpu); -static int ledtrig_cpu_syscore_suspend(void) +static int ledtrig_cpu_syscore_suspend(void *data) { ledtrig_cpu(CPU_LED_STOP); return 0; } -static void ledtrig_cpu_syscore_resume(void) +static void ledtrig_cpu_syscore_resume(void *data) { ledtrig_cpu(CPU_LED_START); } -static void ledtrig_cpu_syscore_shutdown(void) +static void ledtrig_cpu_syscore_shutdown(void *data) { ledtrig_cpu(CPU_LED_HALTED); } -static struct syscore_ops ledtrig_cpu_syscore_ops = { +static const struct syscore_ops ledtrig_cpu_syscore_ops = { .shutdown = ledtrig_cpu_syscore_shutdown, .suspend = ledtrig_cpu_syscore_suspend, .resume = ledtrig_cpu_syscore_resume, }; +static struct syscore ledtrig_cpu_syscore = { + .ops = &ledtrig_cpu_syscore_ops, +}; + static int ledtrig_online_cpu(unsigned int cpu) { ledtrig_cpu(CPU_LED_START); @@ -157,7 +161,7 @@ static int __init ledtrig_cpu_init(void) led_trigger_register_simple(trig->name, &trig->_trig); } - register_syscore_ops(&ledtrig_cpu_syscore_ops); + register_syscore(&ledtrig_cpu_syscore); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "leds/trigger:starting", ledtrig_online_cpu, ledtrig_prepare_down_cpu); diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index b0f09c70f1ff..5fe47e784d43 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2600,7 +2600,7 @@ void pmu_blink(int n) #if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32) int pmu_sys_suspended; -static int pmu_syscore_suspend(void) +static int pmu_syscore_suspend(void *data) { /* Suspend PMU event interrupts */ pmu_suspend(); @@ -2614,7 +2614,7 @@ static int pmu_syscore_suspend(void) return 0; } -static void pmu_syscore_resume(void) +static void pmu_syscore_resume(void *data) { struct adb_request req; @@ -2634,14 +2634,18 @@ static void pmu_syscore_resume(void) pmu_sys_suspended = 0; } -static struct syscore_ops pmu_syscore_ops = { +static const struct syscore_ops pmu_syscore_ops = { .suspend = pmu_syscore_suspend, .resume = pmu_syscore_resume, }; +static struct syscore pmu_syscore = { + .ops = &pmu_syscore_ops, +}; + static int pmu_syscore_register(void) { - register_syscore_ops(&pmu_syscore_ops); + register_syscore(&pmu_syscore); return 0; } diff --git a/drivers/power/reset/sc27xx-poweroff.c b/drivers/power/reset/sc27xx-poweroff.c index 90287c31992c..393bd1c33b73 100644 --- a/drivers/power/reset/sc27xx-poweroff.c +++ b/drivers/power/reset/sc27xx-poweroff.c @@ -28,7 +28,7 @@ static struct regmap *regmap; * taking cpus down to avoid racing regmap or spi mutex lock when poweroff * system through PMIC. 
*/ -static void sc27xx_poweroff_shutdown(void) +static void sc27xx_poweroff_shutdown(void *data) { #ifdef CONFIG_HOTPLUG_CPU int cpu; @@ -40,10 +40,14 @@ static void sc27xx_poweroff_shutdown(void) #endif } -static struct syscore_ops poweroff_syscore_ops = { +static const struct syscore_ops poweroff_syscore_ops = { .shutdown = sc27xx_poweroff_shutdown, }; +static struct syscore poweroff_syscore = { + .ops = &poweroff_syscore_ops, +}; + static void sc27xx_poweroff_do_poweroff(void) { /* Disable the external subsys connection's power firstly */ @@ -62,7 +66,7 @@ static int sc27xx_poweroff_probe(struct platform_device *pdev) return -ENODEV; pm_power_off = sc27xx_poweroff_do_poweroff; - register_syscore_ops(&poweroff_syscore_ops); + register_syscore(&poweroff_syscore); return 0; } diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c index 7a73f5e4a1fc..f02e12dfa5f6 100644 --- a/drivers/sh/clk/core.c +++ b/drivers/sh/clk/core.c @@ -569,7 +569,7 @@ long clk_round_rate(struct clk *clk, unsigned long rate) EXPORT_SYMBOL_GPL(clk_round_rate); #ifdef CONFIG_PM -static void clks_core_resume(void) +static void clks_core_resume(void *data) { struct clk *clkp; @@ -588,13 +588,17 @@ static void clks_core_resume(void) } } -static struct syscore_ops clks_syscore_ops = { +static const struct syscore_ops clks_syscore_ops = { .resume = clks_core_resume, }; +static struct syscore clks_syscore = { + .ops = &clks_syscore_ops, +}; + static int __init clk_syscore_init(void) { - register_syscore_ops(&clks_syscore_ops); + register_syscore(&clks_syscore); return 0; } diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c index ea571eeb3078..3dde703b7766 100644 --- a/drivers/sh/intc/core.c +++ b/drivers/sh/intc/core.c @@ -394,7 +394,7 @@ err0: return -ENOMEM; } -static int intc_suspend(void) +static int intc_suspend(void *data) { struct intc_desc_int *d; @@ -420,7 +420,7 @@ static int intc_suspend(void) return 0; } -static void intc_resume(void) +static void intc_resume(void *data) { struct intc_desc_int *d; @@ -450,11 +450,15 @@ static void intc_resume(void) } } -struct syscore_ops intc_syscore_ops = { +static const struct syscore_ops intc_syscore_ops = { .suspend = intc_suspend, .resume = intc_resume, }; +static struct syscore intc_syscore = { + .ops = &intc_syscore_ops, +}; + const struct bus_type intc_subsys = { .name = "intc", .dev_name = "intc", @@ -477,7 +481,7 @@ static int __init register_intc_devs(void) struct intc_desc_int *d; int error; - register_syscore_ops(&intc_syscore_ops); + register_syscore(&intc_syscore); error = subsys_system_register(&intc_subsys, NULL); if (!error) { diff --git a/drivers/soc/bcm/brcmstb/biuctrl.c b/drivers/soc/bcm/brcmstb/biuctrl.c index 364ddbe365c2..bd830649b60d 100644 --- a/drivers/soc/bcm/brcmstb/biuctrl.c +++ b/drivers/soc/bcm/brcmstb/biuctrl.c @@ -298,7 +298,7 @@ out: #ifdef CONFIG_PM_SLEEP static u32 cpubiuctrl_reg_save[NUM_CPU_BIUCTRL_REGS]; -static int brcmstb_cpu_credit_reg_suspend(void) +static int brcmstb_cpu_credit_reg_suspend(void *data) { unsigned int i; @@ -311,7 +311,7 @@ static int brcmstb_cpu_credit_reg_suspend(void) return 0; } -static void brcmstb_cpu_credit_reg_resume(void) +static void brcmstb_cpu_credit_reg_resume(void *data) { unsigned int i; @@ -322,10 +322,14 @@ static void brcmstb_cpu_credit_reg_resume(void) cbc_writel(cpubiuctrl_reg_save[i], i); } -static struct syscore_ops brcmstb_cpu_credit_syscore_ops = { +static const struct syscore_ops brcmstb_cpu_credit_syscore_ops = { .suspend = brcmstb_cpu_credit_reg_suspend, .resume = 
brcmstb_cpu_credit_reg_resume, }; + +static struct syscore brcmstb_cpu_credit_syscore = { + .ops = &brcmstb_cpu_credit_syscore_ops, +}; #endif @@ -354,7 +358,7 @@ static int __init brcmstb_biuctrl_init(void) a72_b53_rac_enable_all(np); mcp_a72_b53_set(); #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&brcmstb_cpu_credit_syscore_ops); + register_syscore(&brcmstb_cpu_credit_syscore); #endif ret = 0; out_put: diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index 034a2a535a1e..93bbebd68001 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -466,7 +466,7 @@ struct tegra_pmc { unsigned long *wake_type_dual_edge_map; unsigned long *wake_sw_status_map; unsigned long *wake_cntrl_level_map; - struct syscore_ops syscore; + struct syscore syscore; }; static struct tegra_pmc *pmc = &(struct tegra_pmc) { @@ -3147,7 +3147,7 @@ static void tegra186_pmc_process_wake_events(struct tegra_pmc *pmc, unsigned int } } -static void tegra186_pmc_wake_syscore_resume(void) +static void tegra186_pmc_wake_syscore_resume(void *data) { u32 status, mask; unsigned int i; @@ -3160,7 +3160,7 @@ static void tegra186_pmc_wake_syscore_resume(void) } } -static int tegra186_pmc_wake_syscore_suspend(void) +static int tegra186_pmc_wake_syscore_suspend(void *data) { wke_read_sw_wake_status(pmc); @@ -3179,6 +3179,11 @@ static int tegra186_pmc_wake_syscore_suspend(void) return 0; } +static const struct syscore_ops tegra186_pmc_wake_syscore_ops = { + .suspend = tegra186_pmc_wake_syscore_suspend, + .resume = tegra186_pmc_wake_syscore_resume, +}; + #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_ARM) static int tegra_pmc_suspend(struct device *dev) { @@ -3829,10 +3834,8 @@ static const struct tegra_pmc_regs tegra186_pmc_regs = { static void tegra186_pmc_init(struct tegra_pmc *pmc) { - pmc->syscore.suspend = tegra186_pmc_wake_syscore_suspend; - pmc->syscore.resume = tegra186_pmc_wake_syscore_resume; - - register_syscore_ops(&pmc->syscore); + pmc->syscore.ops = &tegra186_pmc_wake_syscore_ops; + register_syscore(&pmc->syscore); } static void tegra186_pmc_setup_irq_polarity(struct tegra_pmc *pmc, diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index bd2fca7dc017..8a2f441cd2ec 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -592,7 +592,7 @@ static void hfi_disable_instance(void *ptr) hfi_disable(); } -static void hfi_syscore_resume(void) +static void hfi_syscore_resume(void *data) { /* This code runs only on the boot CPU. */ struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0); @@ -603,7 +603,7 @@ static void hfi_syscore_resume(void) hfi_enable_instance(hfi_instance); } -static int hfi_syscore_suspend(void) +static int hfi_syscore_suspend(void *data) { /* No locking needed. There is no concurrency with CPU offline. 
*/ hfi_disable(); @@ -611,11 +611,15 @@ static int hfi_syscore_suspend(void) return 0; } -static struct syscore_ops hfi_pm_ops = { +static const struct syscore_ops hfi_pm_ops = { .resume = hfi_syscore_resume, .suspend = hfi_syscore_suspend, }; +static struct syscore hfi_pm = { + .ops = &hfi_pm_ops, +}; + static int hfi_thermal_notify(struct notifier_block *nb, unsigned long state, void *_notify) { @@ -710,7 +714,7 @@ void __init intel_hfi_init(void) if (thermal_genl_register_notifier(&hfi_thermal_nb)) goto err_nl_notif; - register_syscore_ops(&hfi_pm_ops); + register_syscore(&hfi_pm); return; diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 296703939846..f2e8eaf684ba 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -495,7 +495,7 @@ static void xen_acpi_processor_resume_worker(struct work_struct *dummy) pr_info("ACPI data upload failed, error = %d\n", rc); } -static void xen_acpi_processor_resume(void) +static void xen_acpi_processor_resume(void *data) { static DECLARE_WORK(wq, xen_acpi_processor_resume_worker); @@ -509,10 +509,14 @@ static void xen_acpi_processor_resume(void) schedule_work(&wq); } -static struct syscore_ops xap_syscore_ops = { +static const struct syscore_ops xap_syscore_ops = { .resume = xen_acpi_processor_resume, }; +static struct syscore xap_syscore = { + .ops = &xap_syscore_ops, +}; + static int __init xen_acpi_processor_init(void) { int i; @@ -563,7 +567,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + register_syscore(&xap_syscore); return 0; err_unregister: @@ -580,7 +584,7 @@ static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); + unregister_syscore(&xap_syscore); bitmap_free(acpi_ids_done); bitmap_free(acpi_id_present); bitmap_free(acpi_id_cst_present); diff --git a/include/linux/syscore_ops.h b/include/linux/syscore_ops.h index ae4d48e4c970..ac6d71be5c38 100644 --- a/include/linux/syscore_ops.h +++ b/include/linux/syscore_ops.h @@ -11,14 +11,19 @@ #include struct syscore_ops { + int (*suspend)(void *data); + void (*resume)(void *data); + void (*shutdown)(void *data); +}; + +struct syscore { struct list_head node; - int (*suspend)(void); - void (*resume)(void); - void (*shutdown)(void); + const struct syscore_ops *ops; + void *data; }; -extern void register_syscore_ops(struct syscore_ops *ops); -extern void unregister_syscore_ops(struct syscore_ops *ops); +extern void register_syscore(struct syscore *syscore); +extern void unregister_syscore(struct syscore *syscore); #ifdef CONFIG_PM_SLEEP extern int syscore_suspend(void); extern void syscore_resume(void); diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index b0f0d15085db..7481fbb947d3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); #ifdef CONFIG_PM -static int cpu_pm_suspend(void) +static int cpu_pm_suspend(void *data) { int ret; @@ -185,20 +185,24 @@ static int cpu_pm_suspend(void) return ret; } -static void cpu_pm_resume(void) +static void cpu_pm_resume(void *data) { cpu_cluster_pm_exit(); cpu_pm_exit(); } -static struct syscore_ops cpu_pm_syscore_ops = { +static const struct syscore_ops cpu_pm_syscore_ops = { .suspend = cpu_pm_suspend, .resume = cpu_pm_resume, }; +static struct syscore cpu_pm_syscore = { + .ops = &cpu_pm_syscore_ops, +}; + static int cpu_pm_init(void) { - register_syscore_ops(&cpu_pm_syscore_ops); + 
register_syscore(&cpu_pm_syscore); return 0; } core_initcall(cpu_pm_init); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index bf59e37d650a..3cd0c40282c0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) } #ifdef CONFIG_PM -static int irq_gc_suspend(void) +static int irq_gc_suspend(void *data) { struct irq_chip_generic *gc; @@ -670,7 +670,7 @@ static int irq_gc_suspend(void) return 0; } -static void irq_gc_resume(void) +static void irq_gc_resume(void *data) { struct irq_chip_generic *gc; @@ -693,7 +693,7 @@ static void irq_gc_resume(void) #define irq_gc_resume NULL #endif -static void irq_gc_shutdown(void) +static void irq_gc_shutdown(void *data) { struct irq_chip_generic *gc; @@ -709,15 +709,19 @@ static void irq_gc_shutdown(void) } } -static struct syscore_ops irq_gc_syscore_ops = { +static const struct syscore_ops irq_gc_syscore_ops = { .suspend = irq_gc_suspend, .resume = irq_gc_resume, .shutdown = irq_gc_shutdown, }; +static struct syscore irq_gc_syscore = { + .ops = &irq_gc_syscore_ops, +}; + static int __init irq_gc_init_ops(void) { - register_syscore_ops(&irq_gc_syscore_ops); + register_syscore(&irq_gc_syscore); return 0; } device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f7394729cedc..99ff65466d87 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq) /** * irq_pm_syscore_resume - enable interrupt lines early + * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -static void irq_pm_syscore_resume(void) +static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } -static struct syscore_ops irq_pm_syscore_ops = { +static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; +static struct syscore irq_pm_syscore = { + .ops = &irq_pm_syscore_ops, +}; + static int __init irq_pm_init_ops(void) { - register_syscore_ops(&irq_pm_syscore_ops); + register_syscore(&irq_pm_syscore); return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5aee9ffb16b9..f852dce94015 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3639,12 +3639,13 @@ static bool legacy_kthread_create(void) /** * printk_kthreads_shutdown - shutdown all threaded printers + * @data: syscore context * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ -static void printk_kthreads_shutdown(void) +static void printk_kthreads_shutdown(void *data) { struct console *con; @@ -3666,10 +3667,14 @@ static void printk_kthreads_shutdown(void) console_list_unlock(); } -static struct syscore_ops printk_syscore_ops = { +static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; +static struct syscore printk_syscore = { + .ops = &printk_syscore_ops, +}; + /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. 
@@ -3737,7 +3742,7 @@ static void printk_kthreads_check_locked(void) static int __init printk_set_kthreads_ready(void) { - register_syscore_ops(&printk_syscore_ops); + register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc1afec306b3..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -296,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -305,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6974fce800c..f3513679ee09 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1994,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -2061,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 226faeaa8e56..b7675a58d663 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5629,7 +5629,7 @@ static int kvm_offline_cpu(unsigned int cpu) return 0; } -static void kvm_shutdown(void) +static void kvm_shutdown(void *data) { /* * Disable hardware virtualization and set kvm_rebooting to indicate @@ -5647,7 +5647,7 @@ static void kvm_shutdown(void) on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } -static int kvm_suspend(void) +static int kvm_suspend(void *data) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume @@ -5664,7 +5664,7 @@ static int kvm_suspend(void) return 0; } -static void kvm_resume(void) +static void kvm_resume(void *data) { lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); @@ -5672,12 +5672,16 @@ static void kvm_resume(void) WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } -static struct syscore_ops kvm_syscore_ops = { +static const struct syscore_ops kvm_syscore_ops = { .suspend = 
kvm_suspend, .resume = kvm_resume, .shutdown = kvm_shutdown, }; +static struct syscore kvm_syscore = { + .ops = &kvm_syscore_ops, +}; + int kvm_enable_virtualization(void) { int r; @@ -5694,7 +5698,7 @@ int kvm_enable_virtualization(void) if (r) goto err_cpuhp; - register_syscore_ops(&kvm_syscore_ops); + register_syscore(&kvm_syscore); /* * Undo virtualization enabling and bail if the system is going down. @@ -5716,7 +5720,7 @@ int kvm_enable_virtualization(void) return 0; err_rebooting: - unregister_syscore_ops(&kvm_syscore_ops); + unregister_syscore(&kvm_syscore); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: kvm_arch_disable_virtualization(); @@ -5732,7 +5736,7 @@ void kvm_disable_virtualization(void) if (--kvm_usage_count) return; - unregister_syscore_ops(&kvm_syscore_ops); + unregister_syscore(&kvm_syscore); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -- cgit v1.2.3 From 0559730b8570259ef948e9083653f8a87baba182 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 22 Sep 2025 11:43:28 +0200 Subject: pwm: Drop unused function pwm_apply_args() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function pwm_apply_args() was introduced with the concept of atomic PWM configuration and needed for drivers not using this concept yet. Now all drivers are converted accordingly and so no callers are left which allows to remove this function. Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/20250922094327.1143944-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 549ac4aaad59..e59be4e382d1 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -611,39 +611,6 @@ devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, } #endif -static inline void pwm_apply_args(struct pwm_device *pwm) -{ - struct pwm_state state = { }; - - /* - * PWM users calling pwm_apply_args() expect to have a fresh config - * where the polarity and period are set according to pwm_args info. - * The problem is, polarity can only be changed when the PWM is - * disabled. - * - * PWM drivers supporting hardware readout may declare the PWM device - * as enabled, and prevent polarity setting, which changes from the - * existing behavior, where all PWM devices are declared as disabled - * at startup (even if they are actually enabled), thus authorizing - * polarity setting. - * - * To fulfill this requirement, we apply a new state which disables - * the PWM device and set the reference period and polarity config. - * - * Note that PWM users requiring a smooth handover between the - * bootloader and the kernel (like critical regulators controlled by - * PWM devices) will have to switch to the atomic API and avoid calling - * pwm_apply_args(). 
- */ - - state.enabled = false; - state.polarity = pwm->args.polarity; - state.period = pwm->args.period; - state.usage_power = false; - - pwm_apply_might_sleep(pwm, &state); -} - struct pwm_lookup { struct list_head list; const char *provider; -- cgit v1.2.3 From 37f0c7a8df7ad719a68fa1c2dbf066cfebc391a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 14 Nov 2025 11:07:04 +0200 Subject: block-dma: properly take MMIO path In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to indicate MMIO memory"), DMA_ATTR_MMIO attribute was added to describe MMIO addresses, which require to avoid any memory cache flushing, as an outcome of the discussion pointed in Link tag below. In case of PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, blk-mq-dm logic treated this as regular page and relied on "struct page" DMA flow. That flow performs CPU cache flushing, which shouldn't be done here, and doesn't set IOMMU_MMIO flag in DMA-IOMMU case. As a solution, let's encode peer-to-peer transaction type in NVMe IOD flags variable and provide it to blk-mq-dma API. Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 18 +++++++---- drivers/nvme/host/pci.c | 73 ++++++++++++++++++++++++++++++++++++++----- include/linux/bio-integrity.h | 1 - include/linux/blk-integrity.h | 14 --------- include/linux/blk-mq-dma.h | 28 ++++++++--------- include/linux/blk_types.h | 2 -- 6 files changed, 90 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index cebfead826ee..e9108ccaf4b0 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -92,8 +92,13 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { + unsigned int attrs = 0; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len, - rq_dma_dir(req), 0); + rq_dma_dir(req), attrs); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; @@ -108,14 +113,18 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; + unsigned int attrs = 0; int error; iter->addr = state->addr; iter->len = dma_iova_size(state); + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + do { error = dma_iova_link(dma_dev, state, vec->paddr, mapped, - vec->len, dir, 0); + vec->len, dir, attrs); if (error) break; mapped += vec->len; @@ -162,6 +171,7 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; + iter->p2pdma.map = PCI_P2PDMA_MAP_NONE; /* * Grab the first segment ASAP because we'll need it to check for P2P @@ -173,10 +183,6 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: - if (iter->iter.is_integrity) - bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; - else - req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* diff --git 
a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d0dd836ccdb9..9085bed107fd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -260,8 +260,20 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + /* Data payload contains p2p memory */ + IOD_DATA_P2P = 1U << 3, + + /* Metadata contains p2p memory */ + IOD_META_P2P = 1U << 4, + + /* Data payload contains MMIO memory */ + IOD_DATA_MMIO = 1U << 5, + + /* Metadata contains MMIO memory */ + IOD_META_MMIO = 1U << 6, + /* Metadata using non-coalesced MPTR */ - IOD_SINGLE_META_SEGMENT = 1U << 5, + IOD_SINGLE_META_SEGMENT = 1U << 7, }; struct nvme_dma_vec { @@ -733,10 +745,12 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, static void nvme_unmap_metadata(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; enum dma_data_direction dir = rq_dma_dir(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct device *dma_dev = nvmeq->dev->dev; struct nvme_sgl_desc *sge = iod->meta_descriptor; + unsigned int attrs = 0; if (iod->flags & IOD_SINGLE_META_SEGMENT) { dma_unmap_page(dma_dev, iod->meta_dma, @@ -745,13 +759,20 @@ static void nvme_unmap_metadata(struct request *req) return; } - if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, - iod->meta_total_len)) { + if (iod->flags & IOD_META_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_META_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, map)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) - nvme_free_sgls(req, sge, &sge[1], 0); + nvme_free_sgls(req, sge, &sge[1], attrs); else dma_unmap_phys(dma_dev, iod->meta_dma, - iod->meta_total_len, dir, 0); + iod->meta_total_len, dir, attrs); } if (iod->meta_descriptor) @@ -761,9 +782,11 @@ static void nvme_unmap_metadata(struct request *req) static void nvme_unmap_data(struct request *req) { + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct device *dma_dev = nvmeq->dev->dev; + unsigned int attrs = 0; if (iod->flags & IOD_SINGLE_SEGMENT) { static_assert(offsetof(union nvme_data_ptr, prp1) == @@ -773,12 +796,20 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (iod->flags & IOD_DATA_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_DATA_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + map)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], - &iod->cmd.common.dptr.sgl, 0); + &iod->cmd.common.dptr.sgl, attrs); else - nvme_free_prps(req, 0); + nvme_free_prps(req, attrs); } if (iod->nr_descriptors) @@ -1049,6 +1080,19 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_DATA_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_DATA_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: + return BLK_STS_RESOURCE; + } + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && 
nvme_pci_avg_seg_size(req) >= sgl_threshold))) @@ -1071,6 +1115,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) &iod->meta_dma_state, &iter)) return iter.status; + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_META_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_META_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: + return BLK_STS_RESOURCE; + } + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) entries = 1; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3d05296a5afe..21e4652dcfd2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,7 +13,6 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ - BIP_P2P_DMA = 1 << 8, /* using P2P address */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c2030fd8ba0a..a6b84206eb94 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -33,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); -} - int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -129,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return false; -} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 51829958d872..cb88fc791fbd 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -16,13 +16,13 @@ struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; u32 len; + struct pci_p2pdma_map_state p2pdma; /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ struct blk_map_iter iter; - struct pci_p2pdma_map_state p2pdma; }; bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, @@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_dma_unmap - try to DMA unmap a request + * blk_rq_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap - * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR + * @map: peer-to-peer mapping type * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. 
*/ -static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len, bool is_p2p) +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len, + enum pci_p2pdma_map_type map) { - if (is_p2p) + if (map == PCI_P2PDMA_MAP_BUS_ADDR) return true; if (dma_use_iova(state)) { + unsigned int attrs = 0; + + if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), - 0); + attrs); return true; } return !dma_need_unmap(dma_dev); } - -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - req->cmd_flags & REQ_P2PDMA); -} - #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 53501ebb0623..d884cc1256ec 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -393,7 +393,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -426,7 +425,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From cefd55bd2159f427228d44864747243946296739 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Nov 2025 22:29:44 +0100 Subject: nsproxy: fix free_nsproxy() and simplify create_new_namespaces() Make it possible to handle NULL being passed to the reference count helpers instead of forcing the caller to handle this. Afterwards we can nicely allow a cleanup guard to handle nsproxy freeing. Active reference count handling is not done in nsproxy_free() but rather in free_nsproxy() as nsproxy_free() is also called from setns() failure paths where a new nsproxy has been prepared but has not been marked as active via switch_task_namespaces(). 
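As a rough sketch of the effect (illustrative only, not part of this patch; the uts_namespace pointer is just a stand-in for any namespace type covered by to_ns_common()), the NULL-tolerant wrappers let callers drop whatever state happens to be initialised without per-pointer checks, which is what makes a cleanup guard practical:

	struct uts_namespace *uts = NULL;

	ns_ref_inc(uts);	/* expands to a conditional: no-op instead of a NULL dereference */
	ns_ref_put(uts);	/* likewise short-circuits to false, nothing is dropped */
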
Link: https://lore.kernel.org/690bfb9e.050a0220.2e3c35.0013.GAE@google.com Link: https://patch.msgid.link/20251111-sakralbau-guthaben-7dcc277d337f@brauner Fixes: 3c9820d5c64a ("ns: add active reference count") Reported-by: syzbot+0b2e79f91ff6579bfa5b@syzkaller.appspotmail.com Reported-by: syzbot+0a8655a80e189278487e@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 11 +++++++---- include/linux/nsproxy.h | 4 ++-- kernel/nsproxy.c | 36 ++++++++++++++++++++---------------- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 136f6a322e53..825f5865bfc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -114,11 +114,14 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common } #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) -#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns))) -#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) -#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_inc(__ns) \ + do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) +#define ns_ref_get(__ns) \ + ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) +#define ns_ref_put(__ns) \ + ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) #define ns_ref_put_and_lock(__ns, __ns_lock) \ - __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) + ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index ac825eddec59..5a67648721c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -99,7 +99,7 @@ void get_cred_namespaces(struct task_struct *tsk); void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); -void free_nsproxy(struct nsproxy *ns); +void deactivate_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); @@ -107,7 +107,7 @@ int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) - free_nsproxy(ns); + deactivate_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 94c2cfe0afa1..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -60,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. 
Do not attach this to the task, @@ -185,21 +204,6 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - nsproxy_ns_active_put(ns); - - put_mnt_ns(ns->mnt_ns); - put_uts_ns(ns->uts_ns); - put_ipc_ns(ns->ipc_ns); - put_pid_ns(ns->pid_ns_for_children); - put_time_ns(ns->time_ns); - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -338,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) -- cgit v1.2.3 From 4037d966f034ba5da2872c413b2ec17eca867e68 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:25 +1100 Subject: VFS: introduce start_dirop() and end_dirop() The fact that directory operations (create,remove,rename) are protected by a lock on the parent is known widely throughout the kernel. In order to change this - to instead lock the target dentry - it is best to centralise this knowledge so it can be changed in one place. This patch introduces start_dirop() which is local to VFS code. It performs the required locking for create and remove. Rename will be handled separately. Various functions with names like start_creating() or start_removing_path(), some of which already exist, will export this functionality beyond the VFS. end_dirop() is the partner of start_dirop(). It drops the lock and releases the reference on the dentry. It *is* exported so that various end_creating etc functions can be inline. As vfs_mkdir() drops the dentry on error we cannot use end_dirop() as that won't unlock when the dentry IS_ERR(). For now we need an explicit unlock when dentry IS_ERR(). I hope to change vfs_mkdir() to unlock when it drops a dentry so that explicit unlock can go away. end_dirop() can always be called on the result of start_dirop(), but not after vfs_mkdir(). After a vfs_mkdir() we still may need the explicit unlock as seen in end_creating_path(). 
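To make the intended calling convention concrete, here is a minimal sketch of a converted create-style caller (illustrative only; 'parent', 'last', 'idmap' and 'mode' are placeholders, 'last' is an already-hashed qstr, and vfs_create() stands in for whatever operation the caller actually performs):

	dentry = start_dirop(parent, &last, LOOKUP_CREATE | LOOKUP_EXCL);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	error = vfs_create(idmap, d_inode(parent), dentry, mode, true);
	end_dirop(dentry);	/* drops the parent lock and the dentry reference */
	return error;
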
As well as adding start_dirop() and end_dirop() this patch uses them in: - simple_start_creating (which requires sharing lookup_noperm_common() with libfs.c) - start_removing_path / start_removing_user_path_at - filename_create / end_creating_path() - do_rmdir(), do_unlinkat() Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-3-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/internal.h | 3 ++ fs/libfs.c | 36 ++++++++++---------- fs/namei.c | 98 ++++++++++++++++++++++++++++++++++++++++-------------- include/linux/fs.h | 2 ++ 4 files changed, 95 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index 9b2b4d116880..d08d5e2235e9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -67,6 +67,9 @@ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags); +int lookup_noperm_common(struct qstr *qname, struct dentry *base); /* * namespace.c diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..02371f45ef7d 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2289,27 +2289,25 @@ void stashed_dentry_prune(struct dentry *dentry) cmpxchg(stashed, dentry, NULL); } -/* parent must be held exclusive */ +/** + * simple_start_creating - prepare to create a given name + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Required lock is taken and a lookup in performed prior to creating an + * object in a directory. No permission checking is performed. + * + * Returns: a negative dentry on which vfs_create() or similar may + * be attempted, or an error. + */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { - struct dentry *dentry; - struct inode *dir = d_inode(parent); + struct qstr qname = QSTR(name); + int err; - inode_lock(dir); - if (unlikely(IS_DEADDIR(dir))) { - inode_unlock(dir); - return ERR_PTR(-ENOENT); - } - dentry = lookup_noperm(&QSTR(name), parent); - if (IS_ERR(dentry)) { - inode_unlock(dir); - return dentry; - } - if (dentry->d_inode) { - dput(dentry); - inode_unlock(dir); - return ERR_PTR(-EEXIST); - } - return dentry; + err = lookup_noperm_common(&qname, parent); + if (err) + return ERR_PTR(err); + return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..3618efd4bcaa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2765,6 +2765,48 @@ static int filename_parentat(int dfd, struct filename *name, return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } +/** + * start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur + * @name: a qstr holding the name within that parent + * @lookup_flags: intent and other lookup flags. + * + * The lookup is performed and necessary locks are taken so that, on success, + * the returned dentry can be operated on safely. + * The qstr must already have the hash value calculated. + * + * Returns: a locked dentry, or an error. 
+ * + */ +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags) +{ + struct dentry *dentry; + struct inode *dir = d_inode(parent); + + inode_lock_nested(dir, I_MUTEX_PARENT); + dentry = lookup_one_qstr_excl(name, parent, lookup_flags); + if (IS_ERR(dentry)) + inode_unlock(dir); + return dentry; +} + +/** + * end_dirop - signal completion of a dirop + * @de: the dentry which was returned by start_dirop or similar. + * + * If the de is an error, nothing happens. Otherwise any lock taken to + * protect the dentry is dropped and the dentry itself is release (dput()). + */ +void end_dirop(struct dentry *de) +{ + if (!IS_ERR(de)) { + inode_unlock(de->d_parent->d_inode); + dput(de); + } +} +EXPORT_SYMBOL(end_dirop); + /* does lookup, returns the object with parent locked */ static struct dentry *__start_removing_path(int dfd, struct filename *name, struct path *path) @@ -2781,10 +2823,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name, return ERR_PTR(-EINVAL); /* don't fail immediately if it's r/o, at least try to report other errors */ error = mnt_want_write(parent_path.mnt); - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + d = start_dirop(parent_path.dentry, &last, 0); if (IS_ERR(d)) - goto unlock; + goto drop; if (error) goto fail; path->dentry = no_free_ptr(parent_path.dentry); @@ -2792,10 +2833,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name, return d; fail: - dput(d); + end_dirop(d); d = ERR_PTR(error); -unlock: - inode_unlock(parent_path.dentry->d_inode); +drop: if (!error) mnt_drop_write(parent_path.mnt); return d; @@ -2910,7 +2950,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_noperm_common(struct qstr *qname, struct dentry *base) +int lookup_noperm_common(struct qstr *qname, struct dentry *base) { const char *name = qname->name; u32 len = qname->len; @@ -4223,21 +4263,18 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path->dentry, - reval_flag | create_flags); + dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); if (IS_ERR(dentry)) - goto unlock; + goto out_drop_write; if (unlikely(error)) goto fail; return dentry; fail: - dput(dentry); + end_dirop(dentry); dentry = ERR_PTR(error); -unlock: - inode_unlock(path->dentry->d_inode); +out_drop_write: if (!error) mnt_drop_write(path->mnt); out: @@ -4256,11 +4293,26 @@ struct dentry *start_creating_path(int dfd, const char *pathname, } EXPORT_SYMBOL(start_creating_path); +/** + * end_creating_path - finish a code section started by start_creating_path() + * @path: the path instantiated by start_creating_path() + * @dentry: the dentry returned by start_creating_path() + * + * end_creating_path() will unlock and locks taken by start_creating_path() + * and drop an references that were taken. It should only be called + * if start_creating_path() returned a non-error. + * If vfs_mkdir() was called and it returned an error, that error *should* + * be passed to end_creating_path() together with the path. 
+ */ void end_creating_path(const struct path *path, struct dentry *dentry) { - if (!IS_ERR(dentry)) - dput(dentry); - inode_unlock(path->dentry->d_inode); + if (IS_ERR(dentry)) + /* The parent is still locked despite the error from + * vfs_mkdir() - must unlock it. + */ + inode_unlock(path->dentry->d_inode); + else + end_dirop(dentry); mnt_drop_write(path->mnt); path_put(path); } @@ -4592,8 +4644,7 @@ retry: if (error) goto exit2; - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; @@ -4602,9 +4653,8 @@ retry: goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: - dput(dentry); + end_dirop(dentry); exit3: - inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); @@ -4721,8 +4771,7 @@ retry: if (error) goto exit2; retry_deleg: - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -4737,9 +4786,8 @@ retry_deleg: error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: - dput(dentry); + end_dirop(dentry); } - inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..f4543612ef1e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3609,6 +3609,8 @@ extern void iterate_supers_type(struct file_system_type *, void filesystems_freeze(void); void filesystems_thaw(void); +void end_dirop(struct dentry *de); + extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); -- cgit v1.2.3 From 7ab96df840e60eb933abfe65fc5fe44e72f16dc0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:27 +1100 Subject: VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating() start_creating() is similar to simple_start_creating() but is not so simple. It takes a qstr for the name, includes permission checking, and does NOT report an error if the name already exists, returning a positive dentry instead. This is currently used by nfsd, cachefiles, and overlayfs. end_creating() is called after the dentry has been used. end_creating() drops the reference to the dentry as it is generally no longer needed. This is exactly the first section of end_creating_path() so that function is changed to call the new end_creating() These calls help encapsulate locking rules so that directory locking can be changed. Occasionally this change means that the parent lock is held for a shorter period of time, for example in cachefiles_commit_tmpfile(). As this function now unlocks after an unlink and before the following lookup, it is possible that the lookup could again find a positive dentry, so a while loop is introduced there. In overlayfs the ovl_lookup_temp() function has ovl_tempname() split out to be used in ovl_start_creating_temp(). The other use of ovl_lookup_temp() is preparing for a rename. When rename handling is updated, ovl_lookup_temp() will be removed. 
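For reference, the shape of a converted caller, condensed from the nfsd and cachefiles hunks below ('parent', 'name' and 'mode' are placeholders and the error handling is simplified):

	child = start_creating(&nop_mnt_idmap, parent, &QSTR(name));
	if (IS_ERR(child))
		return PTR_ERR(child);
	if (d_is_negative(child))
		err = vfs_create(&nop_mnt_idmap, d_inode(parent), child, mode, true);
	else
		err = 0;	/* name already exists: a positive dentry, not -EEXIST */
	end_creating(child, parent);
	return err;
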
Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-5-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 41 +++++++++---------- fs/namei.c | 35 +++++++++++++---- fs/nfsd/nfs3proc.c | 14 +++---- fs/nfsd/nfs4proc.c | 14 +++---- fs/nfsd/nfs4recover.c | 16 +++----- fs/nfsd/nfsproc.c | 11 +++--- fs/nfsd/vfs.c | 52 ++++++++++-------------- fs/overlayfs/copy_up.c | 19 ++++----- fs/overlayfs/dir.c | 100 +++++++++++++++++++++++++++-------------------- fs/overlayfs/overlayfs.h | 8 ++++ fs/overlayfs/super.c | 32 ++++++++------- include/linux/namei.h | 33 ++++++++++++++++ 12 files changed, 215 insertions(+), 160 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d1edb2ac3837..0a136eb434da 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); retry: ret = cachefiles_inject_read_error(); if (ret == 0) - subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir); + subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname)); else subdir = ERR_PTR(ret); trace_cachefiles_lookup(NULL, dir, subdir); @@ -141,7 +140,7 @@ retry: trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { - dput(subdir); + end_creating(subdir, dir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -154,7 +153,8 @@ retry: /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); - inode_unlock(d_inode(dir)); + dget(subdir); + end_creating(subdir, dir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", @@ -196,14 +196,11 @@ mark_error: return ERR_PTR(-EBUSY); mkdir_error: - inode_unlock(d_inode(dir)); - if (!IS_ERR(subdir)) - dput(subdir); + end_creating(subdir, dir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); @@ -679,36 +676,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, _enter(",%pD", object->file); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } - if (!d_is_negative(dentry)) { + /* + * This loop will only execute more than once if some other thread + * races to create the object we are trying to create. 
+ */ + while (!d_is_negative(dentry)) { ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) - goto out_dput; + goto out_end; + + end_creating(dentry, fan); - dput(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, + &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } } @@ -729,10 +731,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, success = true; } -out_dput: - dput(dentry); -out_unlock: - inode_unlock(d_inode(fan)); +out_end: + end_creating(dentry, fan); +out: _leave(" = %u", success); return success; } diff --git a/fs/namei.c b/fs/namei.c index 9effaad115d9..9972b0257a4c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3221,6 +3221,33 @@ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, } EXPORT_SYMBOL(lookup_noperm_positive_unlocked); +/** + * start_creating - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup is performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned, so + * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail + * with -EEXIST. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -4306,13 +4333,7 @@ EXPORT_SYMBOL(start_creating_path); */ void end_creating_path(const struct path *path, struct dentry *dentry) { - if (IS_ERR(dentry)) - /* The parent is still locked despite the error from - * vfs_mkdir() - must unlock it. 
- */ - inode_unlock(path->dentry->d_inode); - else - end_dirop(dentry); + end_creating(dentry, path->dentry); mnt_drop_write(path->mnt); path_put(path); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b6d03e1ef5f7..e2aac0def2cb 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -281,14 +281,11 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(argp->name, argp->len), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -367,9 +364,8 @@ set_attr: status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); out: - inode_unlock(inode); - if (child && !IS_ERR(child)) - dput(child); + end_creating(child, parent); +out_write: fh_drop_write(fhp); return status; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e466cf52d7d7..b2c95e8e7c68 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -264,14 +264,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (is_create_with_attrs(open)) nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(open->op_fname, open->op_fnamelen), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(open->op_fname, open->op_fnamelen)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -379,10 +376,9 @@ set_attr: if (attrs.na_aclerr) open->op_bmval[0] &= ~FATTR4_WORD0_ACL; out: - inode_unlock(inode); + end_creating(child, parent); nfsd_attrs_free(&attrs); - if (child && !IS_ERR(child)) - dput(child); +out_write: fh_drop_write(fhp); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e2b9472e5c78..c247a7c3291c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -195,13 +195,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_creds; dir = nn->rec_file->f_path.dentry; - /* lock the parent */ - inode_lock(d_inode(dir)); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir); + dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname)); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out_unlock; + goto out; } if (d_really_is_positive(dentry)) /* @@ -212,15 +210,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. 
*/ - goto out_put; + goto out_end; dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); if (IS_ERR(dentry)) status = PTR_ERR(dentry); -out_put: - if (!status) - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); +out_end: + end_creating(dentry, dir); +out: if (status == 0) { if (nn->in_grace) __nfsd4_create_reclaim_record_grace(clp, dname, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 8f71f5748c75..ee1b16e921fd 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -306,18 +306,16 @@ nfsd_proc_create(struct svc_rqst *rqstp) goto done; } - inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len), - dirfhp->fh_dentry); + dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(dchild)) { resp->status = nfserrno(PTR_ERR(dchild)); - goto out_unlock; + goto out_write; } fh_init(newfhp, NFS_FHSIZE); resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); if (!resp->status && d_really_is_negative(dchild)) resp->status = nfserr_noent; - dput(dchild); if (resp->status) { if (resp->status != nfserr_noent) goto out_unlock; @@ -423,7 +421,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) } out_unlock: - inode_unlock(dirfhp->fh_dentry->d_inode); + end_creating(dchild, dirfhp->fh_dentry); +out_write: fh_drop_write(dirfhp); done: fh_put(dirfhp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..4efd3688e081 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1521,7 +1521,7 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* The parent directory should already be locked: */ +/* The parent directory should already be locked - we will unlock */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_attrs *attrs, @@ -1587,8 +1587,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - if (!IS_ERR(dchild)) - dput(dchild); + if (!err) + fh_fill_post_attrs(fhp); + end_creating(dchild, dentry); return err; out_nfserr: @@ -1626,28 +1627,26 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) { - err = nfserrno(host_err); - goto out_unlock; - } + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); /* * We unconditionally drop our ref to dchild as fh_compose will have * already grabbed its own ref for it. 
*/ - dput(dchild); if (err) goto out_unlock; err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); - fh_fill_post_attrs(fhp); + return err; + out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dchild, dentry); return err; } @@ -1733,11 +1732,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, } dentry = fhp->fh_dentry; - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); if (IS_ERR(dnew)) { err = nfserrno(PTR_ERR(dnew)); - inode_unlock(dentry->d_inode); goto out_drop_write; } err = fh_fill_pre_attrs(fhp); @@ -1750,11 +1747,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dnew, dentry); if (!err) err = nfserrno(commit_metadata(fhp)); - dput(dnew); - if (err==0) err = cerr; + if (!err) + err = cerr; out_drop_write: fh_drop_write(fhp); out: @@ -1809,32 +1806,31 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, ddir = ffhp->fh_dentry; dirp = d_inode(ddir); - inode_lock_nested(dirp, I_MUTEX_PARENT); + dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len)); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir); if (IS_ERR(dnew)) { host_err = PTR_ERR(dnew); - goto out_unlock; + goto out_drop_write; } dold = tfhp->fh_dentry; err = nfserr_noent; if (d_really_is_negative(dold)) - goto out_dput; + goto out_unlock; err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) - goto out_dput; + goto out_unlock; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); - inode_unlock(dirp); +out_unlock: + end_creating(dnew, ddir); if (!host_err) { host_err = commit_metadata(ffhp); if (!host_err) host_err = commit_metadata(tfhp); } - dput(dnew); out_drop_write: fh_drop_write(tfhp); if (host_err == -EBUSY) { @@ -1849,12 +1845,6 @@ out_drop_write: } out: return err != nfs_ok ? 
err : nfserrno(host_err); - -out_dput: - dput(dnew); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } static void diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aac7e34f56c1..7a31ca9bdea2 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -613,9 +613,9 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, - c->dentry->d_name.len); + upper = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(c->dentry->d_name.name, + c->dentry->d_name.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); @@ -626,9 +626,8 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } - dput(upper); + end_creating(upper, upperdir); } - inode_unlock(udir); if (err) goto out; @@ -894,16 +893,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); + upper = ovl_start_creating_upper(ofs, c->destdir, + &QSTR_LEN(c->destname.name, + c->destname.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, temp, udir, upper); - dput(upper); + end_creating(upper, c->destdir); } - inode_unlock(udir); if (err) goto out; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index a5e9ddf3023b..f0728547f7d7 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -59,15 +59,21 @@ int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, return 0; } -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) +#define OVL_TEMPNAME_SIZE 20 +static void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) { - struct dentry *temp; - char name[20]; static atomic_t temp_id = ATOMIC_INIT(0); /* counter is allowed to wrap, since temp dentries are ephemeral */ - snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); + snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id)); +} + +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) +{ + struct dentry *temp; + char name[OVL_TEMPNAME_SIZE]; + ovl_tempname(name); temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { pr_err("workdir/%s already exists\n", name); @@ -78,48 +84,52 @@ struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) return temp; } +static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs, + struct dentry *workdir) +{ + char name[OVL_TEMPNAME_SIZE]; + + ovl_tempname(name); + return start_creating(ovl_upper_mnt_idmap(ofs), workdir, + &QSTR(name)); +} + static struct dentry *ovl_whiteout(struct ovl_fs *ofs) { int err; - struct dentry *whiteout; + struct dentry *whiteout, *link; struct dentry *workdir = ofs->workdir; struct inode *wdir = workdir->d_inode; guard(mutex)(&ofs->whiteout_lock); if (!ofs->whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_whiteout(ofs, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); + whiteout = ovl_start_creating_temp(ofs, workdir); if (IS_ERR(whiteout)) return whiteout; - ofs->whiteout = whiteout; + err = ovl_do_whiteout(ofs, wdir, whiteout); + if (!err) + ofs->whiteout = dget(whiteout); + 
end_creating(whiteout, workdir); + if (err) + return ERR_PTR(err); } if (!ofs->no_shared_whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); - if (!IS_ERR(whiteout)) - return whiteout; - if (PTR_ERR(whiteout) != -EMLINK) { - pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n", + link = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(link)) + return link; + err = ovl_do_link(ofs, ofs->whiteout, wdir, link); + if (!err) + whiteout = dget(link); + end_creating(link, workdir); + if (!err) + return whiteout;; + + if (err != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%u)\n", ofs->whiteout->d_inode->i_nlink, - PTR_ERR(whiteout)); + err); ofs->no_shared_whiteout = true; } } @@ -252,10 +262,13 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { struct dentry *ret; - inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT); - ret = ovl_create_real(ofs, workdir, - ovl_lookup_temp(ofs, workdir), attr); - inode_unlock(workdir->d_inode); + ret = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(ret)) + return ret; + ret = ovl_create_real(ofs, workdir, ret, attr); + if (!IS_ERR(ret)) + dget(ret); + end_creating(ret, workdir); return ret; } @@ -354,18 +367,21 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; struct dentry *newdentry; int err; - inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(ofs, upperdir, - ovl_lookup_upper(ofs, dentry->d_name.name, - upperdir, dentry->d_name.len), - attr); - inode_unlock(udir); + newdentry = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); if (IS_ERR(newdentry)) return PTR_ERR(newdentry); + newdentry = ovl_create_real(ofs, upperdir, newdentry, attr); + if (IS_ERR(newdentry)) { + end_creating(newdentry, upperdir); + return PTR_ERR(newdentry); + } + dget(newdentry); + end_creating(newdentry, upperdir); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index c8fd5951fc5e..beeba96cfcb2 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -415,6 +415,14 @@ static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs, &QSTR_LEN(name, len), base); } +static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_creating(ovl_upper_mnt_idmap(ofs), + parent, name); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 43ee4c7296a7..6e0816c1147a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -310,8 +310,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, bool retried = false; retry: - inode_lock_nested(dir, I_MUTEX_PARENT); - work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); + work = ovl_start_creating_upper(ofs, ofs->workbasedir, &QSTR(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -320,14 +319,13 @@ retry: }; if (work->d_inode) 
{ + dget(work); + end_creating(work, ofs->workbasedir); + if (persist) + return work; err = -EEXIST; - inode_unlock(dir); if (retried) goto out_dput; - - if (persist) - return work; - retried = true; err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0); dput(work); @@ -338,7 +336,9 @@ retry: } work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); - inode_unlock(dir); + if (!IS_ERR(work)) + dget(work); + end_creating(work, ofs->workbasedir); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -376,7 +376,6 @@ retry: if (err) goto out_dput; } else { - inode_unlock(dir); err = PTR_ERR(work); goto out_err; } @@ -626,14 +625,17 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, struct dentry *parent, const char *name, umode_t mode) { - size_t len = strlen(name); struct dentry *child; - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - child = ovl_lookup_upper(ofs, name, parent, len); - if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode)); - inode_unlock(parent->d_inode); + child = ovl_start_creating_upper(ofs, parent, &QSTR(name)); + if (!IS_ERR(child)) { + if (!child->d_inode) + child = ovl_create_real(ofs, parent, child, + OVL_CATTR(mode)); + if (!IS_ERR(child)) + dget(child); + end_creating(child, parent); + } dput(parent); return child; diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..3f92c1a16878 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -88,6 +88,39 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); + +/** + * end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() + * @parent: dentry given to start_creating(), + * + * Unlock and release the child. + * + * Unlike end_dirop() this can only be called if start_creating() succeeded. + * It handles @child being and error as vfs_mkdir() might have converted the + * dentry to an error - in that case the parent still needs to be unlocked. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. Even if vfs_mkdir() returns an error + * it must be given to end_creating(). + * + * If vfs_mkdir() was not called, then @child will be a valid dentry and + * @parent will be ignored. + */ +static inline void end_creating(struct dentry *child, struct dentry *parent) +{ + if (IS_ERR(child)) + /* The parent is still locked despite the error from + * vfs_mkdir() - must unlock it. + */ + inode_unlock(parent->d_inode); + else + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -- cgit v1.2.3 From bd6ede8a06e89ca5a94a8b51cea792705d1b8ca2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:28 +1100 Subject: VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing() start_removing() is similar to start_creating() but will only return a positive dentry with the expectation that it will be removed. This is used by nfsd, cachefiles, and overlayfs. They are changed to also use end_removing() to terminate the action begun by start_removing(). This is a simple alias for end_dirop(). 
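In caller terms the new pair behaves roughly as in the sketch below. start_removing(), end_removing(), QSTR() and vfs_unlink() are the real interfaces; the wrapper function is hypothetical and delegation handling is omitted for brevity.

/* Hypothetical example - not part of the patch. */
static int example_unlink_in(struct mnt_idmap *idmap, struct dentry *parent,
			     const char *name)
{
	struct dentry *victim;
	int err;

	/* Locks @parent; returns a positive dentry or an error such as -ENOENT. */
	victim = start_removing(idmap, parent, &QSTR(name));
	if (IS_ERR(victim))
		return PTR_ERR(victim);

	err = vfs_unlink(idmap, d_inode(parent), victim, NULL);

	/* Unlocks the parent and drops the dentry reference. */
	end_removing(victim);
	return err;
}
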
Apart from changes to the error paths, as we no longer need to unlock on a lookup error, an effect on callers is that they don't need to test if the found dentry is positive or negative - they can be sure it is positive. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-6-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 32 ++++++++++++++------------------ fs/namei.c | 27 +++++++++++++++++++++++++++ fs/nfsd/nfs4recover.c | 18 +++++------------- fs/nfsd/vfs.c | 26 ++++++++++---------------- fs/overlayfs/dir.c | 15 +++++++-------- fs/overlayfs/overlayfs.h | 8 ++++++++ include/linux/namei.h | 18 ++++++++++++++++++ 7 files changed, 89 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 0a136eb434da..c7f0c6ab9b88 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -260,6 +260,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, * - File backed objects are unlinked * - Directory backed objects are stuffed into the graveyard for userspace to * delete + * On entry dir must be locked. It will be unlocked on exit. */ int cachefiles_bury_object(struct cachefiles_cache *cache, struct cachefiles_object *object, @@ -274,28 +275,30 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); + /* end_removing() will dput() @rep but we need to keep + * a ref, so take one now. This also stops the dentry + * being negated when unlinked which we need. + */ + dget(rep); + if (rep->d_parent != dir) { - inode_unlock(d_inode(dir)); + end_removing(rep); _leave(" = -ESTALE"); return -ESTALE; } /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { - dget(rep); /* Stop the dentry being negated if it's only pinned - * by a file struct. 
- */ ret = cachefiles_unlink(cache, object, dir, rep, why); - dput(rep); + end_removing(rep); - inode_unlock(d_inode(dir)); _leave(" = %d", ret); return ret; } /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - inode_unlock(d_inode(dir)); + end_removing(rep); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -749,26 +752,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, struct dentry *victim; int ret = -ENOENT; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename)); - victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir); if (IS_ERR(victim)) goto lookup_error; - if (d_is_negative(victim)) - goto lookup_put; if (d_inode(victim)->i_flags & S_KERNEL_FILE) goto lookup_busy; return victim; lookup_busy: ret = -EBUSY; -lookup_put: - inode_unlock(d_inode(dir)); - dput(victim); + end_removing(victim); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ @@ -816,18 +813,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); + dput(victim); if (ret < 0) goto error; fscache_count_culled(); - dput(victim); _leave(" = 0"); return 0; error_unlock: - inode_unlock(d_inode(dir)); + end_removing(victim); error: - dput(victim); if (ret == -ENOENT) return -ESTALE; /* Probably got retired by the netfs */ diff --git a/fs/namei.c b/fs/namei.c index 9972b0257a4c..ae833dfa277c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3248,6 +3248,33 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, } EXPORT_SYMBOL(start_creating); +/** + * start_removing - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c247a7c3291c..3eefaa2202e3 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -324,20 +324,12 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %s\n", name); dir = nn->rec_file->f_path.dentry; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - goto out_unlock; - } - status = -ENOENT; - if (d_really_is_negative(dentry)) - goto out; + dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name)); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); -out: - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); + end_removing(dentry); return status; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4efd3688e081..cd64ffe12e0b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2044,7 +2044,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; - struct inode *rinode; + struct inode *rinode = NULL; __be32 err; int host_err; @@ -2063,24 +2063,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - inode_lock_nested(dirp, I_MUTEX_PARENT); - rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); + host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) - goto out_unlock; + goto out_drop_write; - if (d_really_is_negative(rdentry)) { - dput(rdentry); - host_err = -ENOENT; - goto out_unlock; - } - rinode = d_inode(rdentry); err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; + rinode = d_inode(rdentry); + /* Prevent truncation until after locks dropped */ ihold(rinode); + if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; @@ -2102,10 +2099,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, } fh_fill_post_attrs(fhp); - inode_unlock(dirp); - if (!host_err) +out_unlock: + end_removing(rdentry); + if (!err && !host_err) host_err = commit_metadata(fhp); - dput(rdentry); iput(rinode); /* truncate the inode here */ out_drop_write: @@ -2123,9 +2120,6 @@ out_nfserr: } out: return err != nfs_ok ? 
err : nfserrno(host_err); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } /* diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f0728547f7d7..f0b6e2e7c9d4 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -866,17 +866,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out; } - inode_lock_nested(dir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_start_removing_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_unlock; + goto out_dput; err = -ESTALE; if ((opaquedir && upper != opaquedir) || (!opaquedir && !ovl_matches_upper(dentry, upper))) - goto out_dput_upper; + goto out_unlock; if (is_dir) err = ovl_do_rmdir(ofs, dir, upper); @@ -892,10 +892,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, */ if (!err) d_drop(dentry); -out_dput_upper: - dput(upper); out_unlock: - inode_unlock(dir); + end_removing(upper); +out_dput: dput(opaquedir); out: return err; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index beeba96cfcb2..49ad65f829dc 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -423,6 +423,14 @@ static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs, parent, name); } +static inline struct dentry *ovl_start_removing_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_removing(ovl_upper_mnt_idmap(ofs), + parent, name); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) diff --git a/include/linux/namei.h b/include/linux/namei.h index 3f92c1a16878..9ee76e88f3dd 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -90,6 +90,8 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); /** * end_creating - finish action started with start_creating @@ -121,6 +123,22 @@ static inline void end_creating(struct dentry *child, struct dentry *parent) end_dirop(child); } +/** + * end_removing - finish action started with start_removing + * @child: dentry returned by start_removing() + * @parent: dentry given to start_removing() + * + * Unlock and release the child. + * + * This is identical to end_dirop(). It can be passed the result of + * start_removing() whether that was successful or not, but it not needed + * if start_removing() failed. + */ +static inline void end_removing(struct dentry *child) +{ + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -- cgit v1.2.3 From c9ba789dad15ba65662bba17595c0aeaa0cfcf1c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:29 +1100 Subject: VFS: introduce start_creating_noperm() and start_removing_noperm() xfs, fuse, ipc/mqueue need variants of start_creating or start_removing which do not check permissions. This patch adds _noperm versions of these functions. Note that do_mq_open() was only calling mntget() so it could call path_put() - it didn't really need an extra reference on the mnt. Now it doesn't call mntget() and uses end_creating() which does the dput() half of path_put(). 
Also mq_unlink() previously passed d_inode(dentry->d_parent) as the dir inode to vfs_unlink(). This is after locking d_inode(mnt->mnt_root) These two inodes are the same, but normally calls use the textual parent. So I've changes the vfs_unlink() call to be given d_inode(mnt->mnt_root). Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown -- changes since v2: - dir arg passed to vfs_unlink() in mq_unlink() changed to match the dir passed to lookup_noperm() - restore assignment to path->mnt even though the mntget() is removed. Link: https://patch.msgid.link/20251113002050.676694-7-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 19 ++++++++----------- fs/namei.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/orphanage.c | 11 ++++------- include/linux/namei.h | 2 ++ ipc/mqueue.c | 32 ++++++++++++-------------------- 5 files changed, 74 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ecaec0fea3a1..40ca94922349 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1397,27 +1397,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) - goto unlock; + goto put_parent; err = -ENOENT; dir = d_find_alias(parent); if (!dir) - goto unlock; + goto put_parent; - name->hash = full_name_hash(dir, name->name, name->len); - entry = d_lookup(dir, name); + entry = start_removing_noperm(dir, name); dput(dir); - if (!entry) - goto unlock; + if (IS_ERR(entry)) + goto put_parent; fuse_dir_changed(parent); if (!(flags & FUSE_EXPIRE_ONLY)) d_invalidate(entry); fuse_invalidate_entry_cache(entry); - if (child_nodeid != 0 && d_really_is_positive(entry)) { + if (child_nodeid != 0) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; @@ -1445,10 +1443,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, } else { err = 0; } - dput(entry); - unlock: - inode_unlock(parent); + end_removing(entry); + put_parent: iput(parent); return err; } diff --git a/fs/namei.c b/fs/namei.c index ae833dfa277c..696e4b794416 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3275,6 +3275,54 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, } EXPORT_SYMBOL(start_removing); +/** + * start_creating_noperm - prepare to create a given name without permission checking + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. + * + * If the name already exists, a positive dentry is returned. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating_noperm); + +/** + * start_removing_noperm - prepare to remove a given name without permission checking + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. 
+ * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing_noperm); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 9c12cb844231..e732605924a1 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -152,11 +152,10 @@ xrep_orphanage_create( } /* Try to find the orphanage directory. */ - inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); + orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE)); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); - goto out_unlock_root; + goto out_dput_root; } /* @@ -170,7 +169,7 @@ xrep_orphanage_create( orphanage_dentry, 0750); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) - goto out_unlock_root; + goto out_dput_orphanage; } /* Not a directory? Bail out. */ @@ -200,9 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - dput(orphanage_dentry); -out_unlock_root: - inode_unlock(VFS_I(sc->mp->m_rootip)); + end_creating(orphanage_dentry, root_dentry); out_dput_root: dput(root_dentry); out: diff --git a/include/linux/namei.h b/include/linux/namei.h index 9ee76e88f3dd..688e157d6afc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,8 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); /** * end_creating - finish action started with start_creating diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 093551fe66a7..6d7610310003 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -913,13 +913,12 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, goto out_putname; ro = mnt_want_write(mnt); /* we'll drop it in any case */ - inode_lock(d_inode(root)); - path.dentry = lookup_noperm(&QSTR(name->name), root); + path.dentry = start_creating_noperm(root, &QSTR(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; } - path.mnt = mntget(mnt); + path.mnt = mnt; error = prepare_open(path.dentry, oflag, ro, mode, name, attr); if (!error) { struct file *file = dentry_open(&path, oflag, current_cred()); @@ -928,13 +927,12 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, else error = PTR_ERR(file); } - path_put(&path); out_putfd: if (error) { put_unused_fd(fd); fd = error; } - inode_unlock(d_inode(root)); + end_creating(path.dentry, root); if (!ro) mnt_drop_write(mnt); out_putname: @@ -957,7 +955,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) int err; struct filename *name; struct dentry *dentry; - struct inode *inode = NULL; + struct inode *inode; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; struct vfsmount *mnt = ipc_ns->mq_mnt; @@ -969,26 +967,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); - dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root); + 
dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); - goto out_unlock; + goto out_drop_write; } inode = d_inode(dentry); - if (!inode) { - err = -ENOENT; - } else { - ihold(inode); - err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), - dentry, NULL); - } - dput(dentry); - -out_unlock: - inode_unlock(d_inode(mnt->mnt_root)); + ihold(inode); + err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root), + dentry, NULL); + end_removing(dentry); iput(inode); + +out_drop_write: mnt_drop_write(mnt); out_name: putname(name); -- cgit v1.2.3 From 7bb1eb45e43c4730cbc5a48b9e9295049fccdacb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:31 +1100 Subject: VFS: introduce start_removing_dentry() start_removing_dentry() is similar to start_removing() but instead of providing a name for lookup, the target dentry is given. start_removing_dentry() checks that the dentry is still hashed and in the parent, and if so it locks and increases the refcount so that end_removing() can be used to finish the operation. This is used in cachefiles, overlayfs, smb/server, and apparmor. There will be other users including ecryptfs. As start_removing_dentry() takes an extra reference to the dentry (to be put by end_removing()), there is no need to explicitly take an extra reference to stop d_delete() from using dentry_unlink_inode() to negate the dentry - as in cachefiles_delete_object(), and ksmbd_vfs_unlink(). cachefiles_bury_object() now gets an extra ref to the victim, which is drops. As it includes the needed end_removing() calls, the caller doesn't need them. Reviewed-by: Amir Goldstein Reviewed-by: Namjae Jeon Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-9-neilb@ownmail.net Signed-off-by: Christian Brauner --- fs/cachefiles/interface.c | 11 +++++++---- fs/cachefiles/namei.c | 30 ++++++++++++++---------------- fs/cachefiles/volume.c | 9 ++++++--- fs/namei.c | 33 +++++++++++++++++++++++++++++++++ fs/overlayfs/dir.c | 10 ++++------ fs/overlayfs/readdir.c | 8 ++++---- fs/smb/server/vfs.c | 27 ++++----------------------- include/linux/namei.h | 2 ++ security/apparmor/apparmorfs.c | 8 ++++---- 9 files changed, 78 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 3e63cfe15874..a08250d244ea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) if (!old_tmpfile) { struct cachefiles_volume *volume = object->volume; struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; + struct dentry *obj; - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - cachefiles_bury_object(volume->cache, object, fan, - old_file->f_path.dentry, - FSCACHE_OBJECT_INVALIDATED); + obj = start_removing_dentry(fan, old_file->f_path.dentry); + if (!IS_ERR(obj)) + cachefiles_bury_object(volume->cache, object, + fan, obj, + FSCACHE_OBJECT_INVALIDATED); } fput(old_file); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c7f0c6ab9b88..0104ac00485d 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -261,6 +261,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, * - Directory backed objects are stuffed into the graveyard for userspace to * delete * On entry dir must be locked. 
It will be unlocked on exit. + * On entry there must be at least 2 refs on rep, one will be dropped on exit. */ int cachefiles_bury_object(struct cachefiles_cache *cache, struct cachefiles_object *object, @@ -275,12 +276,6 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); - /* end_removing() will dput() @rep but we need to keep - * a ref, so take one now. This also stops the dentry - * being negated when unlinked which we need. - */ - dget(rep); - if (rep->d_parent != dir) { end_removing(rep); _leave(" = -ESTALE"); @@ -425,13 +420,12 @@ int cachefiles_delete_object(struct cachefiles_object *object, _enter(",OBJ%x{%pD}", object->debug_id, object->file); - /* Stop the dentry being negated if it's only pinned by a file struct. */ - dget(dentry); - - inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); - inode_unlock(d_backing_inode(fan)); - dput(dentry); + dentry = start_removing_dentry(fan, dentry); + if (IS_ERR(dentry)) + ret = PTR_ERR(dentry); + else + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + end_removing(dentry); return ret; } @@ -644,9 +638,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) if (!d_is_reg(dentry)) { pr_err("%pd is not a file\n", dentry); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_bury_object(volume->cache, object, fan, dentry, - FSCACHE_OBJECT_IS_WEIRD); + struct dentry *de = start_removing_dentry(fan, dentry); + if (IS_ERR(de)) + ret = PTR_ERR(de); + else + ret = cachefiles_bury_object(volume->cache, object, + fan, de, + FSCACHE_OBJECT_IS_WEIRD); dput(dentry); if (ret < 0) return false; diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c index 781aac4ef274..90ba926f488e 100644 --- a/fs/cachefiles/volume.c +++ b/fs/cachefiles/volume.c @@ -7,6 +7,7 @@ #include #include +#include #include "internal.h" #include @@ -58,9 +59,11 @@ retry: if (ret < 0) { if (ret != -ESTALE) goto error_dir; - inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT); - cachefiles_bury_object(cache, NULL, cache->store, vdentry, - FSCACHE_VOLUME_IS_WEIRD); + vdentry = start_removing_dentry(cache->store, vdentry); + if (!IS_ERR(vdentry)) + cachefiles_bury_object(cache, NULL, cache->store, + vdentry, + FSCACHE_VOLUME_IS_WEIRD); cachefiles_put_directory(volume->dentry); cond_resched(); goto retry; diff --git a/fs/namei.c b/fs/namei.c index 696e4b794416..bfc443bec8a9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3323,6 +3323,39 @@ struct dentry *start_removing_noperm(struct dentry *parent, } EXPORT_SYMBOL(start_removing_noperm); +/** + * start_removing_dentry - prepare to remove a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and positive, a reference is taken and + * returned. If not an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: the valid dentry, or an error. 
+ */ +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_negative(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-ENOENT); + } + return dget(child); +} +EXPORT_SYMBOL(start_removing_dentry); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f0b6e2e7c9d4..82b6ff0ab2d3 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -47,14 +47,12 @@ static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir, int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *wdentry) { - int err; - - err = ovl_parent_lock(workdir, wdentry); - if (err) - return err; + wdentry = start_removing_dentry(workdir, wdentry); + if (IS_ERR(wdentry)) + return PTR_ERR(wdentry); ovl_cleanup_locked(ofs, workdir->d_inode, wdentry); - ovl_parent_unlock(workdir); + end_removing(wdentry); return 0; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 1e9792cc557b..77ecc39fc33a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1242,11 +1242,11 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, if (!d_is_dir(dentry) || level > 1) return ovl_cleanup(ofs, parent, dentry); - err = ovl_parent_lock(parent, dentry); - if (err) - return err; + dentry = start_removing_dentry(parent, dentry); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); err = ovl_do_rmdir(ofs, parent->d_inode, dentry); - ovl_parent_unlock(parent); + end_removing(dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index ea0a06b0ae44..148c65d59e42 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -49,24 +49,6 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, i_uid_write(inode, i_uid_read(parent_inode)); } -/** - * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable - * @parent: parent dentry - * @child: child dentry - * - * Returns: %0 on success, %-ENOENT if the parent dentry is not stable - */ -int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - if (child->d_parent != parent) { - inode_unlock(d_inode(parent)); - return -ENOENT; - } - - return 0; -} - static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, char *pathname, unsigned int flags, struct path *path, bool for_remove) @@ -1082,18 +1064,17 @@ int ksmbd_vfs_unlink(struct file *filp) return err; dir = dget_parent(dentry); - err = ksmbd_vfs_lock_parent(dir, dentry); - if (err) + dentry = start_removing_dentry(dir, dentry); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) goto out; - dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) err = vfs_rmdir(idmap, d_inode(dir), dentry); else err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); - dput(dentry); - inode_unlock(d_inode(dir)); + end_removing(dentry); if (err) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: diff --git a/include/linux/namei.h b/include/linux/namei.h index 688e157d6afc..7e916e9d7726 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -94,6 +94,8 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_noperm(struct dentry 
*parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child); /** * end_creating - finish action started with start_creating diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 391a586d0557..9d08d103f142 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -355,17 +355,17 @@ static void aafs_remove(struct dentry *dentry) if (!dentry || IS_ERR(dentry)) return; + /* ->d_parent is stable as rename is not supported */ dir = d_inode(dentry->d_parent); - inode_lock(dir); - if (simple_positive(dentry)) { + dentry = start_removing_dentry(dentry->d_parent, dentry); + if (!IS_ERR(dentry) && simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(dir, dentry); else simple_unlink(dir, dentry); d_delete(dentry); - dput(dentry); } - inode_unlock(dir); + end_removing(dentry); simple_release_fs(&aafs_mnt, &aafs_count); } -- cgit v1.2.3 From ff7c4ea11a05c886f018fff4a4d4f4d68d951e25 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:32 +1100 Subject: VFS: add start_creating_killable() and start_removing_killable() These are similar to start_creating() and start_removing(), but allow a fatal signal to abort waiting for the lock. They are used in btrfs for subvol creation and removal. btrfs_may_create() no longer needs IS_DEADDIR() and start_creating_killable() includes that check. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-10-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/btrfs/ioctl.c | 41 ++++++++------------------ fs/namei.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/namei.h | 6 ++++ 3 files changed, 95 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 185bef0df1c2..4fbfdd8faf6a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -904,14 +904,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent, struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); int ret; - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - return ret; - - dentry = lookup_one(idmap, qname, parent); - ret = PTR_ERR(dentry); + dentry = start_creating_killable(idmap, parent, qname); if (IS_ERR(dentry)) - goto out_unlock; + return PTR_ERR(dentry); ret = btrfs_may_create(idmap, dir, dentry); if (ret) @@ -940,9 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent, out_up_read: up_read(&fs_info->subvol_sem); out_dput: - dput(dentry); -out_unlock: - btrfs_inode_unlock(BTRFS_I(dir), 0); + end_creating(dentry, parent); return ret; } @@ -2417,18 +2410,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto free_subvol_name; } - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - goto free_subvol_name; - dentry = lookup_one(idmap, &QSTR(subvol_name), parent); + dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name)); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); - goto out_unlock_dir; - } - - if (d_really_is_negative(dentry)) { - ret = -ENOENT; - goto out_dput; + goto out_end_removing; } inode = d_inode(dentry); @@ -2449,7 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EPERM; if (!btrfs_test_opt(fs_info, 
USER_SUBVOL_RM_ALLOWED)) - goto out_dput; + goto out_end_removing; /* * Do not allow deletion if the parent dir is the same @@ -2460,21 +2445,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EINVAL; if (root == dest) - goto out_dput; + goto out_end_removing; ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (ret) - goto out_dput; + goto out_end_removing; } /* check if subvolume may be deleted by a user */ ret = btrfs_may_delete(idmap, dir, dentry, 1); if (ret) - goto out_dput; + goto out_end_removing; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; - goto out_dput; + goto out_end_removing; } btrfs_inode_lock(BTRFS_I(inode), 0); @@ -2483,10 +2468,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (!ret) d_delete_notify(dir, dentry); -out_dput: - dput(dentry); -out_unlock_dir: - btrfs_inode_unlock(BTRFS_I(dir), 0); +out_end_removing: + end_removing(dentry); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/namei.c b/fs/namei.c index bfc443bec8a9..04d2819bd351 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2778,19 +2778,33 @@ static int filename_parentat(int dfd, struct filename *name, * Returns: a locked dentry, or an error. * */ -struct dentry *start_dirop(struct dentry *parent, struct qstr *name, - unsigned int lookup_flags) +static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags, + unsigned int state) { struct dentry *dentry; struct inode *dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); + if (state == TASK_KILLABLE) { + int ret = down_write_killable_nested(&dir->i_rwsem, + I_MUTEX_PARENT); + if (ret) + return ERR_PTR(ret); + } else { + inode_lock_nested(dir, I_MUTEX_PARENT); + } dentry = lookup_one_qstr_excl(name, parent, lookup_flags); if (IS_ERR(dentry)) inode_unlock(dir); return dentry; } +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags) +{ + return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); +} + /** * end_dirop - signal completion of a dirop * @de: the dentry which was returned by start_dirop or similar. @@ -3275,6 +3289,66 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, } EXPORT_SYMBOL(start_removing); +/** + * start_creating_killable - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_creating_killable); + +/** + * start_removing_killable - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. 
Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, 0, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_removing_killable); + /** * start_creating_noperm - prepare to create a given name without permission checking * @parent: directory in which to prepare to create the name diff --git a/include/linux/namei.h b/include/linux/namei.h index 7e916e9d7726..e5cff89679df 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,12 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_dentry(struct dentry *parent, -- cgit v1.2.3 From 5c8752729970cc2323ba86817254749f7f21f163 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:33 +1100 Subject: VFS/nfsd/ovl: introduce start_renaming() and end_renaming() start_renaming() combines name lookup and locking to prepare for rename. It is used when two names need to be looked up as in nfsd and overlayfs - cases where one or both dentries are already available will be handled separately. __start_renaming() avoids the inode_permission check and hash calculation and is suitable after filename_parentat() in do_renameat2(). It subsumes quite a bit of code from that function. start_renaming() does calculate the hash and check X permission and is suitable elsewhere: - nfsd_rename() - ovl_rename() In ovl, ovl_do_rename_rd() is factored out of ovl_do_rename(), which itself will be gone by the end of the series. Acked-by: Chuck Lever (for nfsd parts) Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown -- Changes since v3: - added missig dput() in ovl_rename when "whiteout" is not-NULL. Changes since v2: - in __start_renaming() some label have been renamed, and err is always set before a "goto out_foo" rather than passing the error in a dentry*. - ovl_do_rename() changed to call the new ovl_do_rename_rd() rather than keeping duplicate code - code around ovl_cleanup() call in ovl_rename() restructured. 
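For callers that look both names up themselves, the combined helper is used roughly as in the sketch below. start_renaming(), end_renaming(), struct renamedata and vfs_rename() are the real interfaces; the wrapper function is hypothetical, and the security and delegation handling that real callers perform is left out.

/* Hypothetical example - not part of the patch. */
static int example_rename(struct mnt_idmap *idmap,
			  struct dentry *old_dir, struct qstr *old_name,
			  struct dentry *new_dir, struct qstr *new_name)
{
	struct renamedata rd = {
		.mnt_idmap  = idmap,
		.old_parent = old_dir,
		.new_parent = new_dir,
		.flags      = 0,
	};
	int err;

	/* Takes the rename locks and fills rd.old_dentry / rd.new_dentry. */
	err = start_renaming(&rd, 0, old_name, new_name);
	if (err)
		return err;

	err = vfs_rename(&rd);

	/* Drops both dentries and the locks taken by start_renaming(). */
	end_renaming(&rd);
	return err;
}
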
Link: https://patch.msgid.link/20251113002050.676694-11-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Acked-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namei.c | 197 ++++++++++++++++++++++++++++++++++------------- fs/nfsd/vfs.c | 73 ++++++------------ fs/overlayfs/dir.c | 74 ++++++++---------- fs/overlayfs/overlayfs.h | 23 ++++-- include/linux/namei.h | 3 + 5 files changed, 218 insertions(+), 152 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 04d2819bd351..0ee0a110b088 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3667,6 +3667,129 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); +/** + * __start_renaming - lookup and lock names for rename + * @rd: rename data containing parent and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. These references and the lock are dropped by + * end_renaming(). + * + * The passed in qstrs must have the hash calculated, and no permission + * checking is performed. + * + * Returns: zero or an error. + */ +static int +__start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d1, *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + trap = lock_rename(rd->old_parent, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + + d1 = lookup_one_qstr_excl(old_last, rd->old_parent, + lookup_flags); + err = PTR_ERR(d1); + if (IS_ERR(d1)) + goto out_unlock; + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_dput_d1; + + if (d1 == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = d1; + rd->new_dentry = d2; + return 0; + +out_dput_d2: + dput(d2); +out_dput_d1: + dput(d1); +out_unlock: + unlock_rename(rd->old_parent, rd->new_parent); + return err; +} + +/** + * start_renaming - lookup and lock names for rename with permission checking + * @rd: rename data containing parent and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. These references and the lock are dropped by + * end_renaming(). + * + * The passed in qstrs need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. 
+ */ +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); + if (err) + return err; + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming(rd, lookup_flags, old_last, new_last); +} +EXPORT_SYMBOL(start_renaming); + +void end_renaming(struct renamedata *rd) +{ + unlock_rename(rd->old_parent, rd->new_parent); + dput(rd->old_dentry); + dput(rd->new_dentry); +} +EXPORT_SYMBOL(end_renaming); + /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from @@ -5504,14 +5627,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; - struct dentry *old_dentry, *new_dentry; - struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = - LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + unsigned int lookup_flags = 0; bool should_retry = false; int error = -EINVAL; @@ -5522,11 +5642,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, (flags & RENAME_EXCHANGE)) goto put_names; - if (flags & RENAME_EXCHANGE) - target_flags = 0; - if (flags & RENAME_NOREPLACE) - target_flags |= LOOKUP_EXCL; - retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); @@ -5556,66 +5671,40 @@ retry: goto exit2; retry_deleg: - trap = lock_rename(new_path.dentry, old_path.dentry); - if (IS_ERR(trap)) { - error = PTR_ERR(trap); + rd.old_parent = old_path.dentry; + rd.mnt_idmap = mnt_idmap(old_path.mnt); + rd.new_parent = new_path.dentry; + rd.delegated_inode = &delegated_inode; + rd.flags = flags; + + error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); + if (error) goto exit_lock_rename; - } - old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, - lookup_flags); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_flags); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; if (flags & RENAME_EXCHANGE) { - if (!d_is_dir(new_dentry)) { + if (!d_is_dir(rd.new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_dir(old_dentry)) { + if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) - goto exit5; + goto exit_unlock; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) - goto exit5; - } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit5; - /* target should not be an ancestor of source */ - if (!(flags & RENAME_EXCHANGE)) - error = -ENOTEMPTY; - if (new_dentry == trap) - goto exit5; + goto exit_unlock; + } - error = security_path_rename(&old_path, old_dentry, - &new_path, new_dentry, flags); + error = security_path_rename(&old_path, rd.old_dentry, + &new_path, rd.new_dentry, flags); if (error) - goto exit5; + goto exit_unlock; - rd.old_parent = old_path.dentry; - rd.old_dentry = old_dentry; - rd.mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_parent = new_path.dentry; - rd.new_dentry = new_dentry; - rd.delegated_inode = &delegated_inode; - rd.flags = 
flags; error = vfs_rename(&rd); -exit5: - dput(new_dentry); -exit4: - dput(old_dentry); -exit3: - unlock_rename(new_path.dentry, old_path.dentry); +exit_unlock: + end_renaming(&rd); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index cd64ffe12e0b..62109885d4db 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1885,11 +1885,12 @@ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { - struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct dentry *fdentry, *tdentry; int type = S_IFDIR; + struct renamedata rd = {}; __be32 err; int host_err; - bool close_cached = false; + struct dentry *close_cached; trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); @@ -1915,15 +1916,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; retry: + close_cached = NULL; host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); goto out; } - trap = lock_rename(tdentry, fdentry); - if (IS_ERR(trap)) { - err = nfserr_xdev; + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = fdentry; + rd.new_parent = tdentry; + + host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen), + &QSTR_LEN(tname, tlen)); + + if (host_err) { + err = nfserrno(host_err); goto out_want_write; } err = fh_fill_pre_attrs(ffhp); @@ -1933,48 +1941,23 @@ retry: if (err != nfs_ok) goto out_unlock; - odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry); - host_err = PTR_ERR(odentry); - if (IS_ERR(odentry)) - goto out_nfserr; + type = d_inode(rd.old_dentry)->i_mode & S_IFMT; + + if (d_inode(rd.new_dentry)) + type = d_inode(rd.new_dentry)->i_mode & S_IFMT; - host_err = -ENOENT; - if (d_really_is_negative(odentry)) - goto out_dput_old; - host_err = -EINVAL; - if (odentry == trap) - goto out_dput_old; - type = d_inode(odentry)->i_mode & S_IFMT; - - ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry); - host_err = PTR_ERR(ndentry); - if (IS_ERR(ndentry)) - goto out_dput_old; - if (d_inode(ndentry)) - type = d_inode(ndentry)->i_mode & S_IFMT; - host_err = -ENOTEMPTY; - if (ndentry == trap) - goto out_dput_new; - - if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && - nfsd_has_cached_files(ndentry)) { - close_cached = true; - goto out_dput_old; + if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(rd.new_dentry)) { + close_cached = dget(rd.new_dentry); + goto out_unlock; } else { - struct renamedata rd = { - .mnt_idmap = &nop_mnt_idmap, - .old_parent = fdentry, - .old_dentry = odentry, - .new_parent = tdentry, - .new_dentry = ndentry, - }; int retries; for (retries = 1;;) { host_err = vfs_rename(&rd); if (host_err != -EAGAIN || !retries--) break; - if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry))) + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry))) break; } if (!host_err) { @@ -1983,11 +1966,6 @@ retry: host_err = commit_metadata(ffhp); } } - out_dput_new: - dput(ndentry); - out_dput_old: - dput(odentry); - out_nfserr: if (host_err == -EBUSY) { /* * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME @@ -2006,7 +1984,7 @@ retry: fh_fill_post_attrs(tfhp); } out_unlock: - unlock_rename(tdentry, fdentry); + end_renaming(&rd); out_want_write: fh_drop_write(ffhp); @@ -2017,9 +1995,8 @@ out_want_write: * until this point and then reattempt the whole shebang. 
*/ if (close_cached) { - close_cached = false; - nfsd_close_cached_files(ndentry); - dput(ndentry); + nfsd_close_cached_files(close_cached); + dput(close_cached); goto retry; } out: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 82b6ff0ab2d3..18a4c7a5ddd2 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1124,9 +1124,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, int err; struct dentry *old_upperdir; struct dentry *new_upperdir; - struct dentry *olddentry = NULL; - struct dentry *newdentry = NULL; - struct dentry *trap, *de; + struct renamedata rd = {}; bool old_opaque; bool new_opaque; bool cleanup_whiteout = false; @@ -1136,6 +1134,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, bool new_is_dir = d_is_dir(new); bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; + struct dentry *whiteout = NULL; const struct cred *old_cred = NULL; struct ovl_fs *ofs = OVL_FS(old->d_sb); LIST_HEAD(list); @@ -1233,29 +1232,21 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } - trap = lock_rename(new_upperdir, old_upperdir); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); - goto out_revert_creds; - } + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = old_upperdir; + rd.new_parent = new_upperdir; + rd.flags = flags; - de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, - old->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) - goto out_unlock; - olddentry = de; + err = start_renaming(&rd, 0, + &QSTR_LEN(old->d_name.name, old->d_name.len), + &QSTR_LEN(new->d_name.name, new->d_name.len)); - err = -ESTALE; - if (!ovl_matches_upper(old, olddentry)) - goto out_unlock; + if (err) + goto out_revert_creds; - de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, - new->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) + err = -ESTALE; + if (!ovl_matches_upper(old, rd.old_dentry)) goto out_unlock; - newdentry = de; old_opaque = ovl_dentry_is_opaque(old); new_opaque = ovl_dentry_is_opaque(new); @@ -1263,15 +1254,15 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, err = -ESTALE; if (d_inode(new) && ovl_dentry_upper(new)) { if (opaquedir) { - if (newdentry != opaquedir) + if (rd.new_dentry != opaquedir) goto out_unlock; } else { - if (!ovl_matches_upper(new, newdentry)) + if (!ovl_matches_upper(new, rd.new_dentry)) goto out_unlock; } } else { - if (!d_is_negative(newdentry)) { - if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) + if (!d_is_negative(rd.new_dentry)) { + if (!new_opaque || !ovl_upper_is_whiteout(ofs, rd.new_dentry)) goto out_unlock; } else { if (flags & RENAME_EXCHANGE) @@ -1279,19 +1270,14 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } - if (olddentry == trap) - goto out_unlock; - if (newdentry == trap) - goto out_unlock; - - if (olddentry->d_inode == newdentry->d_inode) + if (rd.old_dentry->d_inode == rd.new_dentry->d_inode) goto out_unlock; err = 0; if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + err = ovl_set_opaque_xerr(old, rd.old_dentry, -EXDEV); if (err) goto out_unlock; @@ -1299,18 +1285,24 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, err = ovl_set_redirect(new, samedir); else if (!overwrite && new_is_dir && !new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + err = 
ovl_set_opaque_xerr(new, rd.new_dentry, -EXDEV); if (err) goto out_unlock; - err = ovl_do_rename(ofs, old_upperdir, olddentry, - new_upperdir, newdentry, flags); - unlock_rename(new_upperdir, old_upperdir); + err = ovl_do_rename_rd(&rd); + + if (!err && cleanup_whiteout) + whiteout = dget(rd.new_dentry); + + end_renaming(&rd); + if (err) goto out_revert_creds; - if (cleanup_whiteout) - ovl_cleanup(ofs, old_upperdir, newdentry); + if (whiteout) { + ovl_cleanup(ofs, old_upperdir, whiteout); + dput(whiteout); + } if (overwrite && d_inode(new)) { if (new_is_dir) @@ -1336,14 +1328,12 @@ out_revert_creds: else ovl_drop_write(old); out: - dput(newdentry); - dput(olddentry); dput(opaquedir); ovl_cache_free(&list); return err; out_unlock: - unlock_rename(new_upperdir, old_upperdir); + end_renaming(&rd); goto out_revert_creds; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 49ad65f829dc..3cc85a893b5c 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -355,11 +355,24 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } +static inline int ovl_do_rename_rd(struct renamedata *rd) +{ + int err; + + pr_debug("rename(%pd2, %pd2, 0x%x)\n", rd->old_dentry, rd->new_dentry, + rd->flags); + err = vfs_rename(rd); + if (err) { + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", + rd->old_dentry, rd->new_dentry, err); + } + return err; +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { - int err; struct renamedata rd = { .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, @@ -369,13 +382,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, .flags = flags, }; - pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); - err = vfs_rename(&rd); - if (err) { - pr_debug("...rename(%pd2, %pd2, ...) = %i\n", - olddentry, newdentry, err); - } - return err; + return ovl_do_rename_rd(&rd); } static inline int ovl_do_whiteout(struct ovl_fs *ofs, diff --git a/include/linux/namei.h b/include/linux/namei.h index e5cff89679df..19c3d8e336d5 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -156,6 +156,9 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last); +void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping -- cgit v1.2.3 From ac50950ca143fd637dec4f7457a9162e1a4344e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:34 +1100 Subject: VFS/ovl/smb: introduce start_renaming_dentry() Several callers perform a rename on a dentry they already have, and only require lookup for the target name. This includes smb/server and a few different places in overlayfs. start_renaming_dentry() performs the required lookup and takes the required lock using lock_rename_child() It is used in three places in overlayfs and in ksmbd_vfs_rename(). In the ksmbd case, the parent of the source is not important - the source must be renamed from wherever it is. So start_renaming_dentry() allows rd->old_parent to be NULL and only checks it if it is non-NULL. 
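As a rough illustration of that convention, patterned on the ksmbd_vfs_rename() conversion later in this patch (the function name and arguments are placeholders):

static int example_rename_dentry(struct mnt_idmap *idmap,
				 struct dentry *old_child,
				 struct dentry *new_parent,
				 struct qstr *new_name,
				 unsigned int flags)
{
	struct renamedata rd = {
		.mnt_idmap  = idmap,
		.old_parent = NULL,	/* "rename from wherever it is" */
		.new_parent = new_parent,
		.flags      = flags,
	};
	int err;

	/* Locks the correct parent of old_child and looks up new_name. */
	err = start_renaming_dentry(&rd, 0, old_child, new_name);
	if (err)
		return err;

	err = vfs_rename(&rd);

	/* Also drops the reference taken on rd.old_parent. */
	end_renaming(&rd);
	return err;
}
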
On success rd->old_parent will be the parent of old_dentry with an extra reference taken. Other start_renaming function also now take the extra reference and end_renaming() now drops this reference as well. ovl_lookup_temp(), ovl_parent_lock(), and ovl_parent_unlock() are all removed as they are no longer needed. OVL_TEMPNAME_SIZE and ovl_tempname() are now declared in overlayfs.h so that ovl_check_rename_whiteout() can access them. ovl_copy_up_workdir() now always cleans up on error. Reviewed-by: Namjae Jeon Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-12-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 108 ++++++++++++++++++++++++++++++++++++++++++++--- fs/overlayfs/copy_up.c | 54 ++++++++++-------------- fs/overlayfs/dir.c | 19 +-------- fs/overlayfs/overlayfs.h | 8 +--- fs/overlayfs/super.c | 22 +++++----- fs/overlayfs/util.c | 11 ----- fs/smb/server/vfs.c | 60 +++++--------------------- include/linux/namei.h | 2 + 8 files changed, 150 insertions(+), 134 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 0ee0a110b088..5153ceddd37a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3669,7 +3669,7 @@ EXPORT_SYMBOL(unlock_rename); /** * __start_renaming - lookup and lock names for rename - * @rd: rename data containing parent and flags, and + * @rd: rename data containing parents and flags, and * for receiving found dentries * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, * LOOKUP_NO_SYMLINKS etc). @@ -3680,8 +3680,8 @@ EXPORT_SYMBOL(unlock_rename); * rename. * * On success the found dentries are stored in @rd.old_dentry, - * @rd.new_dentry. These references and the lock are dropped by - * end_renaming(). + * @rd.new_dentry and an extra ref is taken on @rd.old_parent. + * These references and the lock are dropped by end_renaming(). * * The passed in qstrs must have the hash calculated, and no permission * checking is performed. @@ -3735,6 +3735,7 @@ __start_renaming(struct renamedata *rd, int lookup_flags, rd->old_dentry = d1; rd->new_dentry = d2; + dget(rd->old_parent); return 0; out_dput_d2: @@ -3748,7 +3749,7 @@ out_unlock: /** * start_renaming - lookup and lock names for rename with permission checking - * @rd: rename data containing parent and flags, and + * @rd: rename data containing parents and flags, and * for receiving found dentries * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, * LOOKUP_NO_SYMLINKS etc). @@ -3759,8 +3760,8 @@ out_unlock: * rename. * * On success the found dentries are stored in @rd.old_dentry, - * @rd.new_dentry. These references and the lock are dropped by - * end_renaming(). + * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. + * These references and the lock are dropped by end_renaming(). * * The passed in qstrs need not have the hash calculated, and basic * eXecute permission checking is performed against @rd.mnt_idmap. 
@@ -3782,11 +3783,106 @@ int start_renaming(struct renamedata *rd, int lookup_flags, } EXPORT_SYMBOL(start_renaming); +static int +__start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { + /* dentry was removed, or moved and explicit parent requested */ + err = -EINVAL; + goto out_unlock; + } + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_unlock; + + if (old_dentry == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = d2; + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_dput_d2: + dput(d2); +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} + +/** + * start_renaming_dentry - lookup and lock name for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_dentry: dentry of name to move + * @new_last: name of target in @rd.new_parent + * + * Look up target name and ensure locks are in place for + * rename. + * + * On success the found dentry is stored in @rd.new_dentry and + * @rd.old_parent is confirmed to be the parent of @old_dentry. If it + * was originally %NULL, it is set. In either case a reference is taken + * so that end_renaming() can have a stable reference to unlock. + * + * References and the lock can be dropped with end_renaming() + * + * The passed in qstr need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. 
+ */ +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); +} +EXPORT_SYMBOL(start_renaming_dentry); + void end_renaming(struct renamedata *rd) { unlock_rename(rd->old_parent, rd->new_parent); dput(rd->old_dentry); dput(rd->new_dentry); + dput(rd->old_parent); } EXPORT_SYMBOL(end_renaming); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 7a31ca9bdea2..27014ada11c7 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -523,8 +523,8 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); - struct dentry *index = NULL; struct dentry *temp = NULL; + struct renamedata rd = {}; struct qstr name = { }; int err; @@ -556,17 +556,15 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (err) goto out; - err = ovl_parent_lock(indexdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = indexdir; + rd.new_parent = indexdir; + err = start_renaming_dentry(&rd, 0, temp, &name); if (err) goto out; - index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); - if (IS_ERR(index)) { - err = PTR_ERR(index); - } else { - err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); - dput(index); - } - ovl_parent_unlock(indexdir); + + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); out: if (err) ovl_cleanup(ofs, indexdir, temp); @@ -763,7 +761,8 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct path path = { .mnt = ovl_upper_mnt(ofs) }; - struct dentry *temp, *upper, *trap; + struct renamedata rd = {}; + struct dentry *temp; struct ovl_cu_creds cc; int err; struct ovl_cattr cattr = { @@ -807,29 +806,24 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) * ovl_copy_up_data(), so lock workdir and destdir and make sure that * temp wasn't moved before copy up completion or cleanup. */ - trap = lock_rename(c->workdir, c->destdir); - if (trap || temp->d_parent != c->workdir) { - /* temp or workdir moved underneath us? abort without cleanup */ - dput(temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = c->workdir; + rd.new_parent = c->destdir; + rd.flags = 0; + err = start_renaming_dentry(&rd, 0, temp, + &QSTR_LEN(c->destname.name, c->destname.len)); + if (err) { + /* temp or workdir moved underneath us? 
map to -EIO */ err = -EIO; - if (!IS_ERR(trap)) - unlock_rename(c->workdir, c->destdir); - goto out; } - - err = ovl_copy_up_metadata(c, temp); if (err) - goto cleanup; + goto cleanup_unlocked; - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto cleanup; + err = ovl_copy_up_metadata(c, temp); + if (!err) + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); - err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); - unlock_rename(c->workdir, c->destdir); - dput(upper); if (err) goto cleanup_unlocked; @@ -850,8 +844,6 @@ out: return err; -cleanup: - unlock_rename(c->workdir, c->destdir); cleanup_unlocked: ovl_cleanup(ofs, c->workdir, temp); dput(temp); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 18a4c7a5ddd2..ac5b4475533e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -57,8 +57,7 @@ int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, return 0; } -#define OVL_TEMPNAME_SIZE 20 -static void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) { static atomic_t temp_id = ATOMIC_INIT(0); @@ -66,22 +65,6 @@ static void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id)); } -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) -{ - struct dentry *temp; - char name[OVL_TEMPNAME_SIZE]; - - ovl_tempname(name); - temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); - if (!IS_ERR(temp) && temp->d_inode) { - pr_err("workdir/%s already exists\n", name); - dput(temp); - temp = ERR_PTR(-EIO); - } - - return temp; -} - static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs, struct dentry *workdir) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 3cc85a893b5c..746bc4ad7b37 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -447,11 +447,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) } /* util.c */ -int ovl_parent_lock(struct dentry *parent, struct dentry *child); -static inline void ovl_parent_unlock(struct dentry *parent) -{ - inode_unlock(parent->d_inode); -} int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); @@ -888,7 +883,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry); -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +#define OVL_TEMPNAME_SIZE 20 +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]); struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 6e0816c1147a..a721ef2b90e8 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -566,9 +566,10 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { struct dentry *workdir = ofs->workdir; struct dentry *temp; - struct dentry *dest; struct dentry *whiteout; struct name_snapshot name; + struct renamedata rd = {}; + char name2[OVL_TEMPNAME_SIZE]; int err; temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); @@ -576,23 +577,21 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) if (IS_ERR(temp)) return err; - err = ovl_parent_lock(workdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + 
rd.new_parent = workdir; + rd.flags = RENAME_WHITEOUT; + ovl_tempname(name2); + err = start_renaming_dentry(&rd, 0, temp, &QSTR(name2)); if (err) { dput(temp); return err; } - dest = ovl_lookup_temp(ofs, workdir); - err = PTR_ERR(dest); - if (IS_ERR(dest)) { - dput(temp); - ovl_parent_unlock(workdir); - return err; - } /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); - ovl_parent_unlock(workdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) { if (err == -EINVAL) err = 0; @@ -616,7 +615,6 @@ cleanup_temp: ovl_cleanup(ofs, workdir, temp); release_dentry_name_snapshot(&name); dput(temp); - dput(dest); return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f76672f2e686..46387aeb6be6 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1548,14 +1548,3 @@ void ovl_copyattr(struct inode *inode) i_size_write(inode, i_size_read(realinode)); spin_unlock(&inode->i_lock); } - -int ovl_parent_lock(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - if (!child || - (!d_unhashed(child) && child->d_parent == parent)) - return 0; - - inode_unlock(parent->d_inode); - return -EINVAL; -} diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 148c65d59e42..ea5ab5b0adb1 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -661,7 +661,6 @@ out1: int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, char *newname, int flags) { - struct dentry *old_parent, *new_dentry, *trap; struct dentry *old_child = old_path->dentry; struct path new_path; struct qstr new_last; @@ -671,7 +670,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; - int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -682,14 +680,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if (flags & RENAME_NOREPLACE) - target_lookup_flags |= LOOKUP_EXCL; - flags &= ~(RENAME_NOREPLACE); - retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -706,17 +696,14 @@ retry: if (err) goto out2; - trap = lock_rename_child(old_child, new_path.dentry); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); + rd.mnt_idmap = mnt_idmap(old_path->mnt); + rd.old_parent = NULL; + rd.new_parent = new_path.dentry; + rd.flags = flags; + rd.delegated_inode = NULL, + err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last); + if (err) goto out_drop_write; - } - - old_parent = dget(old_child->d_parent); - if (d_unhashed(old_child)) { - err = -EINVAL; - goto out3; - } parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent); if (parent_fp) { @@ -729,44 +716,17 @@ retry: ksmbd_fd_put(work, parent_fp); } - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_lookup_flags); - if (IS_ERR(new_dentry)) { - err = PTR_ERR(new_dentry); - goto out3; - } - - if (d_is_symlink(new_dentry)) { + if (d_is_symlink(rd.new_dentry)) { err = -EACCES; - goto out4; - } - - if (old_child == trap) { - err = -EINVAL; - goto out4; - } - - if (new_dentry == trap) { - err = -ENOTEMPTY; - goto out4; + 
goto out3; } - rd.mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_parent = old_parent, - rd.old_dentry = old_child, - rd.new_parent = new_path.dentry, - rd.new_dentry = new_dentry, - rd.flags = flags, - rd.delegated_inode = NULL, err = vfs_rename(&rd); if (err) ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); -out4: - dput(new_dentry); out3: - dput(old_parent); - unlock_rename(old_parent, new_path.dentry); + end_renaming(&rd); out_drop_write: mnt_drop_write(old_path->mnt); out2: diff --git a/include/linux/namei.h b/include/linux/namei.h index 19c3d8e336d5..f73001e3719a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -158,6 +158,8 @@ extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last); void end_renaming(struct renamedata *rd); /** -- cgit v1.2.3 From 833d2b3a072f7ff6005bf84c065c7cbda81a8aaa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:35 +1100 Subject: Add start_renaming_two_dentries() A few callers want to lock for a rename and already have both dentries. Also debugfs does want to perform a lookup but doesn't want permission checking, so start_renaming_dentry() cannot be used. This patch introduces start_renaming_two_dentries() which is given both dentries. debugfs performs one lookup itself. As it will only continue with a negative dentry and as those cannot be renamed or unlinked, it is safe to do the lookup before getting the rename locks. overlayfs uses start_renaming_two_dentries() in three places and selinux uses it twice in sel_make_policy_nodes(). In sel_make_policy_nodes() we now lock for rename twice instead of just once so the combined operation is no longer atomic w.r.t the parent directory locks. As selinux_state.policy_mutex is held across the whole operation this does not open up any interesting races. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-13-neilb@ownmail.net Signed-off-by: Christian Brauner --- fs/debugfs/inode.c | 48 +++++++++++++++----------------- fs/namei.c | 65 ++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/dir.c | 43 +++++++++++++++++++---------- include/linux/namei.h | 2 ++ security/selinux/selinuxfs.c | 15 ++++++++-- 5 files changed, 131 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f241b9df642a..532bd7c46baf 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -842,7 +842,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . int error = 0; const char *new_name; struct name_snapshot old_name; - struct dentry *parent, *target; + struct dentry *target; + struct renamedata rd = {}; struct inode *dir; va_list ap; @@ -855,36 +856,31 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . 
if (!new_name) return -ENOMEM; - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock(dir); + rd.old_parent = dget_parent(dentry); + rd.new_parent = rd.old_parent; + rd.flags = RENAME_NOREPLACE; + target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent); + if (IS_ERR(target)) + return PTR_ERR(target); - take_dentry_name_snapshot(&old_name, dentry); - - if (WARN_ON_ONCE(dentry->d_parent != parent)) { - error = -EINVAL; - goto out; - } - if (strcmp(old_name.name.name, new_name) == 0) - goto out; - target = lookup_noperm(&QSTR(new_name), parent); - if (IS_ERR(target)) { - error = PTR_ERR(target); - goto out; - } - if (d_really_is_positive(target)) { - dput(target); - error = -EINVAL; + error = start_renaming_two_dentries(&rd, dentry, target); + if (error) { + if (error == -EEXIST && target == dentry) + /* it isn't an error to rename a thing to itself */ + error = 0; goto out; } - simple_rename_timestamp(dir, dentry, dir, target); - d_move(dentry, target); - dput(target); + + dir = d_inode(rd.old_parent); + take_dentry_name_snapshot(&old_name, dentry); + simple_rename_timestamp(dir, dentry, dir, rd.new_dentry); + d_move(dentry, rd.new_dentry); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); -out: release_dentry_name_snapshot(&old_name); - inode_unlock(dir); - dput(parent); + end_renaming(&rd); +out: + dput(rd.old_parent); + dput(target); kfree_const(new_name); return error; } diff --git a/fs/namei.c b/fs/namei.c index 5153ceddd37a..4a4b8b96c192 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3877,6 +3877,71 @@ int start_renaming_dentry(struct renamedata *rd, int lookup_flags, } EXPORT_SYMBOL(start_renaming_dentry); +/** + * start_renaming_two_dentries - Lock to dentries in given parents for rename + * @rd: rename data containing parent + * @old_dentry: dentry of name to move + * @new_dentry: dentry to move to + * + * Ensure locks are in place for rename and check parentage is still correct. + * + * On success the two dentries are stored in @rd.old_dentry and + * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to + * be the parents of the dentries. + * + * References and the lock can be dropped with end_renaming() + * + * Returns: zero or an error. 
+ */ +int +start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct dentry *trap; + int err; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + err = -EINVAL; + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) + /* old_dentry was removed, or moved and explicit parent requested */ + goto out_unlock; + if (d_unhashed(new_dentry) || + rd->new_parent != new_dentry->d_parent) + /* new_dentry was removed or moved */ + goto out_unlock; + + if (old_dentry == trap) + /* source is an ancestor of target */ + goto out_unlock; + + if (new_dentry == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_unlock; + } + + err = -EEXIST; + if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) + goto out_unlock; + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = dget(new_dentry); + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} +EXPORT_SYMBOL(start_renaming_two_dentries); + void end_renaming(struct renamedata *rd) { unlock_rename(rd->old_parent, rd->new_parent); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ac5b4475533e..b7f443932d93 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -123,6 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { struct dentry *whiteout; + struct renamedata rd = {}; int err; int flags = 0; @@ -134,10 +135,14 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = ofs->workdir; + rd.new_parent = dir; + rd.flags = flags; + err = start_renaming_two_dentries(&rd, whiteout, dentry); if (!err) { - err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); - unlock_rename(ofs->workdir, dir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); } if (err) goto kill_whiteout; @@ -388,6 +393,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct path upperpath; struct dentry *upper; struct dentry *opaquedir; @@ -413,7 +419,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out; - err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = RENAME_EXCHANGE; + err = start_renaming_two_dentries(&rd, opaquedir, upper); if (err) goto out_cleanup_unlocked; @@ -431,8 +441,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; @@ -445,7 +455,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); 
out_cleanup_unlocked: ovl_cleanup(ofs, workdir, opaquedir); dput(opaquedir); @@ -468,6 +478,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct dentry *upper; struct dentry *newdentry; int err; @@ -499,7 +510,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_dput; - err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = 0; + err = start_renaming_two_dentries(&rd, newdentry, upper); if (err) goto out_cleanup_unlocked; @@ -536,16 +551,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, - RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + rd.flags = RENAME_EXCHANGE; + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; ovl_cleanup(ofs, workdir, upper); } else { - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; } @@ -565,7 +580,7 @@ out: return err; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); out_cleanup_unlocked: ovl_cleanup(ofs, workdir, newdentry); dput(newdentry); diff --git a/include/linux/namei.h b/include/linux/namei.h index f73001e3719a..a99ac8b7e24a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -160,6 +160,8 @@ int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last); +int start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry); void end_renaming(struct renamedata *rd); /** diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 232e087bce3e..404e08bf60ba 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -506,6 +506,7 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, { int ret = 0; struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir; + struct renamedata rd = {}; unsigned int bool_num = 0; char **bool_names = NULL; int *bool_values = NULL; @@ -539,9 +540,14 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, if (ret) goto out; - lock_rename(tmp_parent, fsi->sb->s_root); + rd.old_parent = tmp_parent; + rd.new_parent = fsi->sb->s_root; /* booleans */ + ret = start_renaming_two_dentries(&rd, tmp_bool_dir, fsi->bool_dir); + if (ret) + goto out; + d_exchange(tmp_bool_dir, fsi->bool_dir); swap(fsi->bool_num, bool_num); @@ -549,12 +555,17 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, swap(fsi->bool_pending_values, bool_values); fsi->bool_dir = tmp_bool_dir; + end_renaming(&rd); /* classes */ + ret = start_renaming_two_dentries(&rd, tmp_class_dir, fsi->class_dir); + if (ret) + goto out; + d_exchange(tmp_class_dir, fsi->class_dir); fsi->class_dir = tmp_class_dir; - unlock_rename(tmp_parent, fsi->sb->s_root); + end_renaming(&rd); out: sel_remove_old_bool_data(bool_num, bool_names, bool_values); -- cgit v1.2.3 From 
f046fbb4d81d1b0c4a169707411e3cd540c03354 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:36 +1100 Subject: ecryptfs: use new start_creating/start_removing APIs This requires the addition of start_creating_dentry() which is given the dentry which has already been found, and asks for it to be locked and its parent validated. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-14-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 153 ++++++++++++++++++++++++-------------------------- fs/namei.c | 33 +++++++++++ include/linux/namei.h | 2 + 3 files changed, 107 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..6a5bca89e752 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -24,18 +24,26 @@ #include #include "ecryptfs_kernel.h" -static int lock_parent(struct dentry *dentry, - struct dentry **lower_dentry, - struct inode **lower_dir) +static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry) { - struct dentry *lower_dir_dentry; + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - *lower_dir = d_inode(lower_dir_dentry); - *lower_dentry = ecryptfs_dentry_to_lower(dentry); + ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; +} - inode_lock_nested(*lower_dir, I_MUTEX_PARENT); - return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL; +static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry) +{ + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; + + ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; } static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) @@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, - NULL); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + + lower_dir = lower_dentry->d_parent->d_inode; + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -186,10 +190,12 @@ ecryptfs_do_create(struct inode *directory_inode, struct inode *lower_dir; struct inode *inode; - rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_create(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, true); + lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry); + if (IS_ERR(lower_dentry)) + return ERR_CAST(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + rc = 
vfs_create(&nop_mnt_idmap, lower_dir, + lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -205,7 +211,7 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_attr_times(directory_inode, lower_dir); fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry, NULL); return inode; } @@ -433,10 +439,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); - rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); - if (!rc) - rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, - lower_new_dentry, NULL); + lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry); + if (IS_ERR(lower_new_dentry)) + return PTR_ERR(lower_new_dentry); + lower_dir = lower_new_dentry->d_parent->d_inode; + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, + lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -448,7 +456,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - inode_unlock(lower_dir); + end_creating(lower_new_dentry, NULL); return rc; } @@ -468,9 +476,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, size_t encoded_symlen; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out_lock; + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + mount_crypt_stat = &ecryptfs_superblock_to_private( dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, @@ -490,7 +500,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry, NULL); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -501,12 +511,14 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { int rc; struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out; - + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return lower_dentry; + lower_dir_dentry = dget(lower_dentry->d_parent); + lower_dir = lower_dir_dentry->d_inode; lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, lower_dentry, mode); rc = PTR_ERR(lower_dentry); @@ -522,7 +534,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_inode_size(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); out: - inode_unlock(lower_dir); + end_creating(lower_dentry, lower_dir_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return ERR_PTR(rc); @@ -534,21 +546,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = 
vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); if (!rc) { clear_nlink(d_inode(dentry)); fsstack_copy_attr_times(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); } - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -562,10 +571,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *lower_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mknod(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, dev); + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -574,7 +585,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out: - inode_unlock(lower_dir); + end_removing(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -590,7 +601,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap; struct inode *target_inode; struct renamedata rd = {}; @@ -605,31 +615,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, target_inode = d_inode(new_dentry); - trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - if (IS_ERR(trap)) - return PTR_ERR(trap); - dget(lower_new_dentry); - rc = -EINVAL; - if (lower_old_dentry->d_parent != lower_old_dir_dentry) - goto out_lock; - if (lower_new_dentry->d_parent != lower_new_dir_dentry) - goto out_lock; - if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry)) - goto out_lock; - /* source should not be ancestor of target */ - if (trap == lower_old_dentry) - goto out_lock; - /* target should not be ancestor of source */ - if (trap == lower_new_dentry) { - rc = -ENOTEMPTY; - goto out_lock; - } + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = lower_old_dir_dentry; + rd.new_parent = lower_new_dir_dentry; + rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry); + if (rc) + return rc; - rd.mnt_idmap = &nop_mnt_idmap; - rd.old_parent = lower_old_dir_dentry; - rd.old_dentry = lower_old_dentry; - rd.new_parent = lower_new_dir_dentry; - rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) goto out_lock; @@ -640,8 +632,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - dput(lower_new_dentry); - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + end_renaming(&rd); return rc; } diff --git a/fs/namei.c b/fs/namei.c index 4a4b8b96c192..8b7807cd1343 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3397,6 +3397,39 @@ struct dentry *start_removing_noperm(struct dentry *parent, } EXPORT_SYMBOL(start_removing_noperm); +/** + * start_creating_dentry - prepare to create a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be 
removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and negative a reference is taken and + * returned. If not an error is returned. + * + * end_creating() should be called when creation is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_positive(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EEXIST); + } + return dget(child); +} +EXPORT_SYMBOL(start_creating_dentry); + /** * start_removing_dentry - prepare to remove a given dentry * @parent: directory from which dentry should be removed diff --git a/include/linux/namei.h b/include/linux/namei.h index a99ac8b7e24a..208aed1d6728 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -100,6 +100,8 @@ struct dentry *start_removing_killable(struct mnt_idmap *idmap, struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child); struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); -- cgit v1.2.3 From fe497f0759e0efb949f9480911d00b6045c21f50 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:37 +1100 Subject: VFS: change vfs_mkdir() to unlock on failure. vfs_mkdir() already drops the reference to the dentry on failure but it leaves the parent locked. This complicates end_creating() which needs to unlock the parent even though the dentry is no longer available. If we change vfs_mkdir() to unlock on failure as well as releasing the dentry, we can remove the "parent" arg from end_creating() and simplify the rules for calling it. Note that cachefiles_get_directory() can choose to substitute an error instead of actually calling vfs_mkdir(), for fault injection. In that case it needs to call end_creating(), just as vfs_mkdir() now does on error. ovl_create_real() will now unlock on error. So the conditional end_creating() after the call is removed, and end_creating() is called internally on error. 
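A minimal sketch of the resulting calling convention, patterned on the cachefiles_get_directory() conversion in this patch; the function name, mode and use of nop_mnt_idmap are illustrative:

static struct dentry *example_mkdir(struct dentry *parent, struct qstr *name)
{
	struct dentry *child;

	child = start_creating(&nop_mnt_idmap, parent, name);
	if (IS_ERR(child))
		return child;

	child = vfs_mkdir(&nop_mnt_idmap, d_inode(parent), child, 0700);
	if (!IS_ERR(child))
		dget(child);	/* keep a reference past end_creating() */

	/*
	 * Safe either way: on success this drops the lookup reference and
	 * unlocks the parent; on failure vfs_mkdir() has already dput() the
	 * dentry and unlocked, so there is nothing left for end_creating()
	 * to undo.
	 */
	end_creating(child);
	return child;
}
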
Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-15-neilb@ownmail.net Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 13 +++++++++++++ fs/btrfs/ioctl.c | 2 +- fs/cachefiles/namei.c | 16 +++++++++------- fs/ecryptfs/inode.c | 8 ++++---- fs/namei.c | 4 ++-- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4recover.c | 2 +- fs/nfsd/nfsproc.c | 2 +- fs/nfsd/vfs.c | 8 ++++---- fs/overlayfs/copy_up.c | 4 ++-- fs/overlayfs/dir.c | 18 ++++++++---------- fs/overlayfs/super.c | 6 +++--- fs/xfs/scrub/orphanage.c | 2 +- include/linux/namei.h | 28 +++++++++------------------- ipc/mqueue.c | 2 +- 16 files changed, 61 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..76ff738a00f3 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1309,3 +1309,16 @@ a different length, use vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) instead. + +--- + +**mandatory** + +vfs_mkdir() now returns a dentry - the one returned by ->mkdir(). If +that dentry is different from the dentry passed in, including if it is +an IS_ERR() dentry pointer, the original dentry is dput(). + +When vfs_mkdir() returns an error, and so both dputs() the original +dentry and doesn't provide a replacement, it also unlocks the parent. +Consequently the return value from vfs_mkdir() can be passed to +end_creating() and the parent will be unlocked precisely when necessary. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4fbfdd8faf6a..90ef777eae25 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -935,7 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent, out_up_read: up_read(&fs_info->subvol_sem); out_dput: - end_creating(dentry, parent); + end_creating(dentry); return ret; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 0104ac00485d..59327618ac42 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -128,10 +128,12 @@ retry: if (ret < 0) goto mkdir_error; ret = cachefiles_inject_write_error(); - if (ret == 0) + if (ret == 0) { subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - else + } else { + end_creating(subdir); subdir = ERR_PTR(ret); + } if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -140,7 +142,7 @@ retry: trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { - end_creating(subdir, dir); + end_creating(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -154,7 +156,7 @@ retry: /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); dget(subdir); - end_creating(subdir, dir); + end_creating(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", @@ -196,7 +198,7 @@ mark_error: return ERR_PTR(-EBUSY); mkdir_error: - end_creating(subdir, dir); + end_creating(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); @@ -699,7 +701,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, if (ret < 0) goto out_end; - end_creating(dentry, fan); + end_creating(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) @@ -733,7 +735,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache 
*cache, } out_end: - end_creating(dentry, fan); + end_creating(dentry); out: _leave(" = %u", success); return success; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 6a5bca89e752..2ad1db2cd2ec 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -211,7 +211,7 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_attr_times(directory_inode, lower_dir); fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - end_creating(lower_dentry, NULL); + end_creating(lower_dentry); return inode; } @@ -456,7 +456,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - end_creating(lower_new_dentry, NULL); + end_creating(lower_new_dentry); return rc; } @@ -500,7 +500,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out_lock: - end_creating(lower_dentry, NULL); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -534,7 +534,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_inode_size(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); out: - end_creating(lower_dentry, lower_dir_dentry); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return ERR_PTR(rc); diff --git a/fs/namei.c b/fs/namei.c index 8b7807cd1343..d284ebae41bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4832,7 +4832,7 @@ EXPORT_SYMBOL(start_creating_path); */ void end_creating_path(const struct path *path, struct dentry *dentry) { - end_creating(dentry, path->dentry); + end_creating(dentry); mnt_drop_write(path->mnt); path_put(path); } @@ -5034,7 +5034,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, return dentry; err: - dput(dentry); + end_creating(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index e2aac0def2cb..6b39e4aff959 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -364,7 +364,7 @@ set_attr: status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); out: - end_creating(child, parent); + end_creating(child); out_write: fh_drop_write(fhp); return status; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b2c95e8e7c68..524cb07a477c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -376,7 +376,7 @@ set_attr: if (attrs.na_aclerr) open->op_bmval[0] &= ~FATTR4_WORD0_ACL; out: - end_creating(child, parent); + end_creating(child); nfsd_attrs_free(&attrs); out_write: fh_drop_write(fhp); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3eefaa2202e3..18c08395b273 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -215,7 +215,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (IS_ERR(dentry)) status = PTR_ERR(dentry); out_end: - end_creating(dentry, dir); + end_creating(dentry); out: if (status == 0) { if (nn->in_grace) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index ee1b16e921fd..28f03a6a3cc3 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -421,7 +421,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) } out_unlock: - end_creating(dchild, dirfhp->fh_dentry); + end_creating(dchild); out_write: fh_drop_write(dirfhp); done: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 62109885d4db..6e9a57863904 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1589,7 +1589,7 @@ nfsd_create_locked(struct 
svc_rqst *rqstp, struct svc_fh *fhp, out: if (!err) fh_fill_post_attrs(fhp); - end_creating(dchild, dentry); + end_creating(dchild); return err; out_nfserr: @@ -1646,7 +1646,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; out_unlock: - end_creating(dchild, dentry); + end_creating(dchild); return err; } @@ -1747,7 +1747,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); out_unlock: - end_creating(dnew, dentry); + end_creating(dnew); if (!err) err = nfserrno(commit_metadata(fhp)); if (!err) @@ -1824,7 +1824,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); out_unlock: - end_creating(dnew, ddir); + end_creating(dnew); if (!host_err) { host_err = commit_metadata(ffhp); if (!host_err) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 27014ada11c7..36949856ddea 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -624,7 +624,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } - end_creating(upper, upperdir); + end_creating(upper); } if (err) goto out; @@ -891,7 +891,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, temp, udir, upper); - end_creating(upper, c->destdir); + end_creating(upper); } if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index b7f443932d93..e097ef4e79d2 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -91,7 +91,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) err = ovl_do_whiteout(ofs, wdir, whiteout); if (!err) ofs->whiteout = dget(whiteout); - end_creating(whiteout, workdir); + end_creating(whiteout); if (err) return ERR_PTR(err); } @@ -103,7 +103,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) err = ovl_do_link(ofs, ofs->whiteout, wdir, link); if (!err) whiteout = dget(link); - end_creating(link, workdir); + end_creating(link); if (!err) return whiteout;; @@ -187,7 +187,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) { pr_warn_ratelimited("wrong inherited casefold (%pd2)\n", newdentry); - dput(newdentry); + end_creating(newdentry); err = -EINVAL; } break; @@ -237,8 +237,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, } out: if (err) { - if (!IS_ERR(newdentry)) - dput(newdentry); + end_creating(newdentry); return ERR_PTR(err); } return newdentry; @@ -254,7 +253,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, ret = ovl_create_real(ofs, workdir, ret, attr); if (!IS_ERR(ret)) dget(ret); - end_creating(ret, workdir); + end_creating(ret); return ret; } @@ -362,12 +361,11 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) return PTR_ERR(newdentry); newdentry = ovl_create_real(ofs, upperdir, newdentry, attr); - if (IS_ERR(newdentry)) { - end_creating(newdentry, upperdir); + if (IS_ERR(newdentry)) return PTR_ERR(newdentry); - } + dget(newdentry); - end_creating(newdentry, upperdir); + end_creating(newdentry); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a721ef2b90e8..3acda985c8a3 100644 --- a/fs/overlayfs/super.c +++ 
b/fs/overlayfs/super.c @@ -320,7 +320,7 @@ retry: if (work->d_inode) { dget(work); - end_creating(work, ofs->workbasedir); + end_creating(work); if (persist) return work; err = -EEXIST; @@ -338,7 +338,7 @@ retry: work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); if (!IS_ERR(work)) dget(work); - end_creating(work, ofs->workbasedir); + end_creating(work); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -632,7 +632,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, OVL_CATTR(mode)); if (!IS_ERR(child)) dget(child); - end_creating(child, parent); + end_creating(child); } dput(parent); diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index e732605924a1..b77c2b6b6d44 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -199,7 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - end_creating(orphanage_dentry, root_dentry); + end_creating(orphanage_dentry); out_dput_root: dput(root_dentry); out: diff --git a/include/linux/namei.h b/include/linux/namei.h index 208aed1d6728..0ef73d739a31 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -105,34 +105,24 @@ struct dentry *start_creating_dentry(struct dentry *parent, struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); -/** - * end_creating - finish action started with start_creating - * @child: dentry returned by start_creating() or vfs_mkdir() - * @parent: dentry given to start_creating(), - * - * Unlock and release the child. +/* end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() * - * Unlike end_dirop() this can only be called if start_creating() succeeded. - * It handles @child being and error as vfs_mkdir() might have converted the - * dentry to an error - in that case the parent still needs to be unlocked. + * Unlock and release the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. * * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() - * may have provided a new dentry. Even if vfs_mkdir() returns an error - * it must be given to end_creating(). + * may have provided a new dentry. + * + * * If vfs_mkdir() was not called, then @child will be a valid dentry and * @parent will be ignored. */ -static inline void end_creating(struct dentry *child, struct dentry *parent) +static inline void end_creating(struct dentry *child) { - if (IS_ERR(child)) - /* The parent is still locked despite the error from - * vfs_mkdir() - must unlock it. - */ - inode_unlock(parent->d_inode); - else - end_dirop(child); + end_dirop(child); } /** diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 6d7610310003..83d9466710d6 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -932,7 +932,7 @@ out_putfd: put_unused_fd(fd); fd = error; } - end_creating(path.dentry, root); + end_creating(path.dentry); if (!ro) mnt_drop_write(mnt); out_putname: -- cgit v1.2.3 From cf296b294c3bd8f7db229060efe677dfd49e46b6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:38 +1100 Subject: VFS: introduce end_creating_keep() Occasionally the caller of end_creating() wants to keep using the dentry. Rather than requiring them to dget() the dentry (when not an error) before calling end_creating(), provide end_creating_keep() which does this. cachefiles and overlayfs make use of this.
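The net effect on a caller, condensed from the ovl_create_temp() hunk below (shown here only to illustrate the helper's contract):

        /* before: take a private reference, then unlock */
        ret = ovl_create_real(ofs, workdir, ret, attr);
        if (!IS_ERR(ret))
                dget(ret);
        end_creating(ret);
        return ret;

        /* after: end_creating_keep() takes the reference unless @ret is an error */
        ret = ovl_create_real(ofs, workdir, ret, attr);
        return end_creating_keep(ret);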
Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-16-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 3 +-- fs/overlayfs/dir.c | 8 ++------ fs/overlayfs/super.c | 11 +++-------- include/linux/namei.h | 22 ++++++++++++++++++++++ 4 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 59327618ac42..ef22ac19545b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -155,8 +155,7 @@ retry: /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); - dget(subdir); - end_creating(subdir); + end_creating_keep(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index e097ef4e79d2..10e2b7e41a7a 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -251,10 +251,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, if (IS_ERR(ret)) return ret; ret = ovl_create_real(ofs, workdir, ret, attr); - if (!IS_ERR(ret)) - dget(ret); - end_creating(ret); - return ret; + return end_creating_keep(ret); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -364,8 +361,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) return PTR_ERR(newdentry); - dget(newdentry); - end_creating(newdentry); + end_creating_keep(newdentry); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 3acda985c8a3..7b8fc1cab6eb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -319,8 +319,7 @@ retry: }; if (work->d_inode) { - dget(work); - end_creating(work); + end_creating_keep(work); if (persist) return work; err = -EEXIST; @@ -336,9 +335,7 @@ retry: } work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); - if (!IS_ERR(work)) - dget(work); - end_creating(work); + end_creating_keep(work); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -630,9 +627,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, if (!child->d_inode) child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode)); - if (!IS_ERR(child)) - dget(child); - end_creating(child); + end_creating_keep(child); } dput(parent); diff --git a/include/linux/namei.h b/include/linux/namei.h index 0ef73d739a31..3d82c6a19197 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -125,6 +125,28 @@ static inline void end_creating(struct dentry *child) end_dirop(child); } +/* end_creating_keep - finish action started with start_creating() and return result + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and return the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * Returns: @child, which may be a dentry or an error. 
+ * + */ +static inline struct dentry *end_creating_keep(struct dentry *child) +{ + if (!IS_ERR(child)) + dget(child); + end_dirop(child); + return child; +} + /** * end_removing - finish action started with start_removing * @child: dentry returned by start_removing() -- cgit v1.2.3 From c42ba5a87bdccbca11403b7ca8bad1a57b833732 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 10 Nov 2025 10:38:52 +0100 Subject: futex: Store time as ktime_t in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The futex core uses ktime_t to represent times, use that also for the restart block. This allows the simplification of the accessors. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-2-5d39cc93df4f@linutronix.de --- include/linux/restart_block.h | 2 +- kernel/futex/waitwake.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..4f9316e7590d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -32,7 +32,7 @@ struct restart_block { u32 val; u32 flags; u32 bitset; - u64 time; + ktime_t time; u32 __user *uaddr2; } futex; /* For nanosleep */ diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec2..1c2dd03f11ec 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; + ktime_t *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) + tp = &restart->futex.time; - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, -- cgit v1.2.3 From 4702f4eceb639b6af199151e352e570943619d98 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 10 Nov 2025 10:38:53 +0100 Subject: hrtimer: Store time as ktime_t in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hrtimer core uses ktime_t to represent times, use that also for the restart block. CPU timers internally use nanoseconds instead of ktime_t but use the same restart block, so use the correct accessors for those. 
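For context (not part of the patch): ktime_t is already a signed 64-bit nanosecond count, so switching the member from u64 to ktime_t keeps the size and layout of struct restart_block and simply lets hrtimer_set_expires()/hrtimer_get_expires() be used without the _tv64 conversions:

        /* include/linux/ktime.h (for reference, abridged) */
        typedef s64 ktime_t;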
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-3-5d39cc93df4f@linutronix.de --- include/linux/restart_block.h | 2 +- kernel/time/hrtimer.c | 4 ++-- kernel/time/posix-cpu-timers.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..36ddfa1ec301 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -43,7 +43,7 @@ struct restart_block { struct __kernel_timespec __user *rmtp; struct old_timespec32 __user *compat_rmtp; }; - u64 expires; + ktime_t expires; } nanosleep; /* For poll */ struct { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 7e7b2b471bae..9c77e5c72556 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2e5b89d7d866..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } -- cgit v1.2.3 From 0ca04993dac9b0d21ffbfd22bf54cc43ec2c49f2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:23 -0600 Subject: PM: Introduce new PMSG_POWEROFF event PMSG_POWEROFF will be used for the PM core to allow differentiating between a hibernation or shutdown sequence when re-using callbacks for common code. Hibernation is started by writing a hibernation method (such as 'platform' 'shutdown', or 'reboot') to use into /sys/power/disk and writing 'disk' to /sys/power/state. Shutdown is initiated with the reboot() syscall with arguments on whether to halt the system or power it off. Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251112224025.2051702-2-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 5 +++++ include/linux/pm.h | 3 +++ include/trace/events/power.h | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 7a8807ec9a5d..38fc8a978b88 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -96,6 +96,8 @@ static const char *pm_verb(int event) return "restore"; case PM_EVENT_RECOVER: return "recover"; + case PM_EVENT_POWEROFF: + return "poweroff"; default: return "(unknown PM event)"; } @@ -368,6 +370,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: @@ -402,6 +405,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: @@ -436,6 +440,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; + case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: diff --git a/include/linux/pm.h b/include/linux/pm.h index a72e42eec130..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -508,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. + * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. 
@@ -538,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -552,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 82904291c2b8..370f8df2fdb4 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample, { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ - { PM_EVENT_RECOVER, "recover" }) + { PM_EVENT_RECOVER, "recover" }, \ + { PM_EVENT_POWEROFF, "poweroff" }) DEFINE_EVENT(cpu, cpu_frequency, -- cgit v1.2.3 From 4ef92743625818932b9c320152b58274c05e5053 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Nov 2025 12:55:16 +0000 Subject: bpf: Add bpf_prog_run_data_pointers() syzbot found that cls_bpf_classify() is able to change tc_skb_cb(skb)->drop_reason triggering a warning in sk_skb_reason_drop(). WARNING: CPU: 0 PID: 5965 at net/core/skbuff.c:1192 __sk_skb_reason_drop net/core/skbuff.c:1189 [inline] WARNING: CPU: 0 PID: 5965 at net/core/skbuff.c:1192 sk_skb_reason_drop+0x76/0x170 net/core/skbuff.c:1214 struct tc_skb_cb has been added in commit ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block"), which added a wrong interaction with db58ba459202 ("bpf: wire in data and data_end for cls_act_bpf"). drop_reason was added later. Add bpf_prog_run_data_pointers() helper to save/restore the net_sched storage colliding with BPF data_meta/data_end. 
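The collision arises because both control-block views alias the same 48-byte skb->cb[] area; an abridged sketch of the two layouts (field lists shortened, see include/linux/filter.h and include/net/pkt_sched.h for the full definitions):

        struct bpf_skb_data_end {               /* BPF view of skb->cb[] */
                struct qdisc_skb_cb qdisc_cb;
                void *data_meta;
                void *data_end;
        };

        struct tc_skb_cb {                      /* net_sched view, abridged */
                struct qdisc_skb_cb qdisc_cb;
                u32 drop_reason;                /* shares bytes with the pointers above */
                /* ... */
        };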
Fixes: ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block") Reported-by: syzbot Closes: https://lore.kernel.org/netdev/6913437c.a70a0220.22f260.013b.GAE@google.com/ Signed-off-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Reviewed-by: Victor Nogueira Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251112125516.1563021-1-edumazet@google.com --- include/linux/filter.h | 20 ++++++++++++++++++++ net/sched/act_bpf.c | 6 ++---- net/sched/cls_bpf.c | 6 ++---- 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a..973233b82dc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -901,6 +901,26 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +static inline int bpf_prog_run_data_pointers( + const struct bpf_prog *prog, + struct sk_buff *skb) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + void *save_data_meta, *save_data_end; + int res; + + save_data_meta = cb->data_meta; + save_data_end = cb->data_end; + + bpf_compute_data_pointers(skb); + res = bpf_prog_run(prog, skb); + + cb->data_meta = save_data_meta; + cb->data_end = save_data_end; + + return res; +} + /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 396b576390d0..c2b5bc19e091 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,12 +47,10 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7fbe42f0e5c2..a32754a2658b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -97,12 +97,10 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; -- cgit v1.2.3 From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:49 -0500 Subject: KVM: SEV: Consolidate the SEV policy bits in a single header file Consolidate SEV policy bit definitions into a single file. Use include/linux/psp-sev.h to hold the definitions and remove the current definitions from the arch/x86/kvm/svm/sev.c and arch/x86/include/svm.h files. No functional change intended. 
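Since the consolidated masks are plain BIT()/GENMASK() definitions, they compose with the usual bitfield helpers; an illustrative (non-upstream) snippet:

        #include <linux/bitfield.h>

        /* sketch: decode the minimum firmware API version from an SNP guest policy */
        u8 api_major = FIELD_GET(SNP_POLICY_MASK_API_MAJOR, policy);
        u8 api_minor = FIELD_GET(SNP_POLICY_MASK_API_MINOR, policy);
        bool smt_allowed = policy & SNP_POLICY_MASK_SMT;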
Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 16 ++++------------ arch/x86/kvm/svm/svm.h | 3 --- include/linux/psp-sev.h | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0835c664fbfd..f04589ae76bb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,15 +65,7 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 -/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ -#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) -#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) -#define SNP_POLICY_MASK_SMT BIT_ULL(16) -#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) -#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) -#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) - -#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ SNP_POLICY_MASK_API_MAJOR | \ SNP_POLICY_MASK_SMT | \ SNP_POLICY_MASK_RSVD_MBO | \ @@ -2207,7 +2199,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~SNP_POLICY_MASK_VALID) + if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) return -EINVAL; /* Check for policy bits that must be set */ @@ -5085,10 +5077,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) /* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { - if (!(sev->policy & SNP_POLICY_DEBUG)) + if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) return NULL; } else { - if (sev->policy & SEV_POLICY_NODBG) + if (sev->policy & SEV_POLICY_MASK_NODBG) return NULL; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6765a5e433ce..a9f6c1ece63d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -117,9 +117,6 @@ struct kvm_sev_info { cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; -#define SEV_POLICY_NODBG BIT_ULL(0) -#define SNP_POLICY_DEBUG BIT_ULL(19) - struct kvm_svm { struct kvm kvm; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..27c92543bf38 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,25 @@ #include +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". 
*/ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** -- cgit v1.2.3 From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:50 -0500 Subject: crypto: ccp - Add an API to return the supported SEV-SNP policy bits Supported policy bits are dependent on the level of SEV firmware that is currently running. Create an API to return the supported policy bits for the current level of firmware. Signed-off-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- drivers/crypto/ccp/sev-dev.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 18 ++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..db7c7c50cebc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void) } EXPORT_SYMBOL_GPL(sev_platform_shutdown); +u64 sev_get_snp_policy_bits(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + u64 policy_bits; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + + policy_bits = SNP_POLICY_MASK_BASE; + + if (sev->snp_plat_status.feature_info) { + if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_RAPL_DIS; + + if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM; + + if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS; + + if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CXL_ALLOW; + + if (sev_version_greater_or_equal(1, 58)) + policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE; + } + + return policy_bits; +} +EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 27c92543bf38..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -32,6 +32,20 @@ #define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) #define SNP_POLICY_MASK_DEBUG BIT_ULL(19) #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ @@ -868,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define 
SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ -- cgit v1.2.3 From 337b1b566db087347194e4543ddfdfa5645275cc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 18:26:23 +0200 Subject: PCI: Fix restoring BARs on BAR resize rollback path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BAR resize operation is implemented in the pci_resize_resource() and pbus_reassign_bridge_resources() functions. pci_resize_resource() can be called either from __resource_resize_store() from sysfs or directly by the driver for the Endpoint Device. The pci_resize_resource() requires that caller has released the device resources that share the bridge window with the BAR to be resized as otherwise the bridge window is pinned in place and cannot be changed. pbus_reassign_bridge_resources() rolls back resources if the resize operation fails, but rollback is performed only for the bridge windows. Because releasing the device resources are done by the caller of the BAR resize interface, these functions performing the BAR resize do not have access to the device resources as they were before the resize. pbus_reassign_bridge_resources() could try __pci_bridge_assign_resources() after rolling back the bridge windows as they were, however, it will not guarantee the resource are assigned due to differences in how FW and the kernel assign the resources (alignment of the start address and tail). To perform rollback robustly, the BAR resize interface has to be altered to also release the device resources that share the bridge window with the BAR to be resized. Also, remove restoring from the entries failed list as saved list should now contain both the bridge windows and device resources so the extra restore is duplicated work. Some drivers (currently only amdgpu) want to prevent releasing some resources. Add exclude_bars param to pci_resize_resource() and make amdgpu pass its register BAR (BAR 2 or 5), which should never be released during resize operation. Normally 64-bit prefetchable resources do not share a bridge window with the 32-bit only register BAR, but there are various fallbacks in the resource assignment logic which may make the resources share the bridge window in rare cases. This change (together with the driver side changes) is to counter the resource releases that had to be done to prevent resource tree corruption in the ("PCI: Release assigned resource before restoring them") change. As such, it likely restores functionality in cases where device resources were released to avoid resource tree conflicts which appeared to be "working" when such conflicts were not correctly detected by the kernel. 
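Illustratively, a driver-side caller now looks roughly like the sketch below (the BAR numbers and rbar_size are placeholders; the amdgpu hunk further down passes its register BAR the same way):

        /* Resize BAR 0 while keeping BAR 5 (e.g. a register BAR) out of the
         * release/restore cycle via the new exclude_bars bitmask. */
        pci_release_resource(pdev, 0);
        err = pci_resize_resource(pdev, 0, rbar_size, 1 << 5);
        if (err == -ENOSPC)
                dev_info(&pdev->dev, "not enough address space for a large BAR\n");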
Reported-by: Simon Richter Link: https://lore.kernel.org/linux-pci/f9a8c975-f5d3-4dd2-988e-4371a1433a60@hogyros.de/ Reported-by: Alex Bennée Link: https://lore.kernel.org/linux-pci/874irqop6b.fsf@draig.linaro.org/ Signed-off-by: Ilpo Järvinen [bhelgaas: squash amdgpu BAR selection from https://lore.kernel.org/r/20251114103053.13778-1-ilpo.jarvinen@linux.intel.com] Signed-off-by: Bjorn Helgaas Tested-by: Alex Bennée # AVA, AMD GPU Reviewed-by: Christian König Link: https://patch.msgid.link/20251113162628.5946-7-ilpo.jarvinen@linux.intel.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +- drivers/gpu/drm/i915/gt/intel_region_lmem.c | 2 +- drivers/gpu/drm/xe/xe_vram.c | 2 +- drivers/pci/pci-sysfs.c | 17 +---- drivers/pci/pci.h | 4 +- drivers/pci/setup-bus.c | 100 +++++++++++++++++++++------- drivers/pci/setup-res.c | 23 ++----- include/linux/pci.h | 3 +- 8 files changed, 93 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7a899fb4de29..bf0bc38e1c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1736,7 +1736,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) pci_release_resource(adev->pdev, 0); - r = pci_resize_resource(adev->pdev, 0, rbar_size); + r = pci_resize_resource(adev->pdev, 0, rbar_size, + (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5 + : 1 << 2); if (r == -ENOSPC) dev_info(adev->dev, "Not enough PCI address space for a large BAR."); diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c index 51bb27e10a4f..7699e8fcf5ed 100644 --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c @@ -37,7 +37,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size) _release_bars(pdev); - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n", resno, 1 << bar_size, ERR_PTR(ret)); diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index b44ebf50fedb..00dd027057df 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -36,7 +36,7 @@ _resize_bar(struct xe_device *xe, int resno, resource_size_t size) if (pci_resource_len(pdev, resno)) pci_release_resource(pdev, resno); - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). 
Consider enabling 'Resizable BAR' support in your BIOS\n", resno, 1 << bar_size, ERR_PTR(ret)); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..2a1b5456c2dc 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n, { struct pci_dev *pdev = to_pci_dev(dev); struct pci_bus *bus = pdev->bus; - struct resource *b_win, *res; unsigned long size; - int ret, i; + int ret; u16 cmd; if (kstrtoul(buf, 0, &size) < 0) return -EINVAL; - b_win = pbus_select_window(bus, pci_resource_n(pdev, n)); - if (!b_win) - return -EINVAL; - device_lock(dev); if (dev->driver || pci_num_vf(pdev)) { ret = -EBUSY; @@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n, pci_remove_resource_files(pdev); - pci_dev_for_each_resource(pdev, res, i) { - if (i >= PCI_BRIDGE_RESOURCES) - break; - - if (b_win == pbus_select_window(bus, res)) - pci_release_resource(pdev, i); - } - - ret = pci_resize_resource(pdev, n, size); + ret = pci_resize_resource(pdev, n, size, 0); pci_assign_unassigned_bus_resources(bus); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index bf1a577e9623..9893ea12d1f2 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -421,8 +421,10 @@ enum pci_bar_type { struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size); +int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size, + int exclude_bars); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 51f5e5a80b54..7e268960954b 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2420,18 +2420,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources); * release it when possible. If the bridge window contains assigned * resources, it cannot be released. 
*/ -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) +static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res, + struct list_head *saved) { unsigned long type = res->flags; struct pci_dev_resource *dev_res; struct pci_dev *bridge = NULL; - LIST_HEAD(saved); LIST_HEAD(added); LIST_HEAD(failed); unsigned int i; - int ret; - - down_read(&pci_bus_sem); + int ret = 0; while (!pci_is_root_bus(bus)) { bridge = bus->self; @@ -2443,9 +2441,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) /* Ignore BARs which are still in use */ if (!res->child) { - ret = add_to_list(&saved, bridge, res, 0, 0); + ret = add_to_list(saved, bridge, res, 0, 0); if (ret) - goto cleanup; + return ret; pci_release_resource(bridge, i); } else { @@ -2468,34 +2466,78 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) free_list(&added); if (!list_empty(&failed)) { - if (pci_required_resource_failed(&failed, type)) { + if (pci_required_resource_failed(&failed, type)) ret = -ENOSPC; - goto cleanup; - } - /* Only resources with unrelated types failed (again) */ free_list(&failed); + if (ret) + return ret; + + /* Only resources with unrelated types failed (again) */ } - list_for_each_entry(dev_res, &saved, list) { + list_for_each_entry(dev_res, saved, list) { struct pci_dev *dev = dev_res->dev; /* Skip the bridge we just assigned resources for */ if (bridge == dev) continue; + if (!dev->subordinate) + continue; + pci_setup_bridge(dev->subordinate); } - free_list(&saved); - up_read(&pci_bus_sem); return 0; +} -cleanup: - /* Restore size and flags */ - list_for_each_entry(dev_res, &failed, list) - restore_dev_resource(dev_res); - free_list(&failed); +int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size, + int exclude_bars) +{ + struct resource *res = pci_resource_n(pdev, resno); + struct pci_dev_resource *dev_res; + struct pci_bus *bus = pdev->bus; + struct resource *b_win, *r; + LIST_HEAD(saved); + unsigned int i; + int ret = 0; + + b_win = pbus_select_window(bus, res); + if (!b_win) + return -EINVAL; + + pci_dev_for_each_resource(pdev, r, i) { + if (i >= PCI_BRIDGE_RESOURCES) + break; + + if (exclude_bars & BIT(i)) + continue; + if (b_win != pbus_select_window(bus, r)) + continue; + + ret = add_to_list(&saved, pdev, r, 0, 0); + if (ret) + goto restore; + pci_release_resource(pdev, i); + } + + pci_resize_resource_set_size(pdev, resno, size); + + if (!bus->self) + goto out; + + down_read(&pci_bus_sem); + ret = pbus_reassign_bridge_resources(bus, res, &saved); + if (ret) + goto restore; + +out: + up_read(&pci_bus_sem); + free_list(&saved); + return ret; + +restore: /* Revert to the old configuration */ list_for_each_entry(dev_res, &saved, list) { struct resource *res = dev_res->res; @@ -2510,13 +2552,21 @@ cleanup: restore_dev_resource(dev_res); - pci_claim_resource(dev, i); - pci_setup_bridge(dev->subordinate); - } - up_read(&pci_bus_sem); - free_list(&saved); + ret = pci_claim_resource(dev, i); + if (ret) + continue; - return ret; + if (i < PCI_BRIDGE_RESOURCES) { + const char *res_name = pci_resource_name(dev, i); + + pci_update_resource(dev, i); + pci_info(dev, "%s %pR: old value restored\n", + res_name, res); + } + if (dev->subordinate) + pci_setup_bridge(dev->subordinate); + } + goto out; } void pci_assign_unassigned_bus_resources(struct pci_bus *bus) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 3d0b0b3f60c4..e4486d7030c0 100644 --- a/drivers/pci/setup-res.c 
+++ b/drivers/pci/setup-res.c @@ -444,8 +444,7 @@ static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev, return cmd & PCI_COMMAND_MEMORY; } -static void pci_resize_resource_set_size(struct pci_dev *dev, int resno, - int size) +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size) { resource_size_t res_size = pci_rebar_size_to_bytes(size); struct resource *res = pci_resource_n(dev, resno); @@ -456,9 +455,9 @@ static void pci_resize_resource_set_size(struct pci_dev *dev, int resno, resource_set_size(res, res_size); } -int pci_resize_resource(struct pci_dev *dev, int resno, int size) +int pci_resize_resource(struct pci_dev *dev, int resno, int size, + int exclude_bars) { - struct resource *res = pci_resource_n(dev, resno); struct pci_host_bridge *host; int old, ret; u32 sizes; @@ -468,10 +467,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size) if (host->preserve_config) return -ENOTSUPP; - /* Make sure the resource isn't assigned before resizing it. */ - if (!(res->flags & IORESOURCE_UNSET)) - return -EBUSY; - if (pci_resize_is_memory_decoding_enabled(dev, resno)) return -EBUSY; @@ -490,19 +485,13 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size) if (ret) return ret; - pci_resize_resource_set_size(dev, resno, size); - - /* Check if the new config works by trying to assign everything. */ - if (dev->bus->self) { - ret = pbus_reassign_bridge_resources(dev->bus, res); - if (ret) - goto error_resize; - } + ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars); + if (ret) + goto error_resize; return 0; error_resize: pci_rebar_set_size(dev, resno, old); - pci_resize_resource_set_size(dev, resno, old); return ret; } EXPORT_SYMBOL(pci_resize_resource); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..34ff295cd2e3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1428,7 +1428,8 @@ static inline int pci_rebar_bytes_to_size(u64 bytes) } u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); -int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); +int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, + int exclude_bars); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From 876e15943e9205096441cbe520dc9ccf82df8344 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:44 +0200 Subject: PCI: Move pci_rebar_bytes_to_size() and clean it up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move pci_rebar_bytes_to_size() from include/linux/pci.h to rebar.c as it does not look very trivial and is not expected to be performance critical. Convert literals to use a newly added PCI_REBAR_MIN_SIZE define. Also add kernel doc for the function as the function is exported. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Michael J. 
Ruhl Link: https://patch.msgid.link/20251113180053.27944-3-ilpo.jarvinen@linux.intel.com --- drivers/pci/rebar.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 10 +++------- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index f6ed7e4893a7..f29810fe0a58 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -7,11 +7,34 @@ #include #include #include +#include #include +#include #include #include "pci.h" +#define PCI_REBAR_MIN_SIZE ((resource_size_t)SZ_1M) + +/** + * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size + * @bytes: size in bytes + * + * Convert size in bytes to encoded BAR Size in Resizable BAR Capability + * (PCIe r6.2, sec. 7.8.6.3). + * + * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + */ +int pci_rebar_bytes_to_size(u64 bytes) +{ + int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE); + + bytes = roundup_pow_of_two(bytes); + + return max(ilog2(bytes), rebar_minsize) - rebar_minsize; +} +EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size); + void pci_rebar_init(struct pci_dev *pdev) { pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); diff --git a/include/linux/pci.h b/include/linux/pci.h index 34ff295cd2e3..628dda63b9e0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1419,17 +1419,13 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_release_resource(struct pci_dev *dev, int resno); -static inline int pci_rebar_bytes_to_size(u64 bytes) -{ - bytes = roundup_pow_of_two(bytes); - - /* Return BAR size as defined in the resizable BAR specification */ - return max(ilog2(bytes), 20) - 20; -} +/* Resizable BAR related routines */ +int pci_rebar_bytes_to_size(u64 bytes); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); + int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From a337869885083131e575c6367c679f4da4b68bb0 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:45 +0200 Subject: PCI: Move pci_rebar_size_to_bytes() and export it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_rebar_size_to_bytes() is in drivers/pci/pci.h but would be useful for endpoint drivers as well. Move the function to rebar.c and export it. In addition, convert the literal to where the number comes from (PCI_REBAR_MIN_SIZE). 
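As a quick sanity check of the encoding these helpers implement (0 = 1 MB, each step doubles; the values below follow directly from the definitions above):

        pci_rebar_bytes_to_size(SZ_1M);         /* -> 0 */
        pci_rebar_bytes_to_size(SZ_256M);       /* -> 8 */
        pci_rebar_bytes_to_size(SZ_256M + 1);   /* rounds up -> 9 */
        pci_rebar_size_to_bytes(8);             /* -> 0x10000000 (256 MB) */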
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-4-ilpo.jarvinen@linux.intel.com --- drivers/pci/pci.h | 4 ---- drivers/pci/rebar.c | 12 ++++++++++++ include/linux/pci.h | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 41df35920632..a1e7dbeb0f2c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1024,10 +1024,6 @@ void pci_rebar_init(struct pci_dev *pdev); void pci_restore_rebar_state(struct pci_dev *pdev); int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); -static inline u64 pci_rebar_size_to_bytes(int size) -{ - return 1ULL << (size + 20); -} struct device_node; diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index f29810fe0a58..a81cb3fcddef 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -35,6 +35,18 @@ int pci_rebar_bytes_to_size(u64 bytes) } EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size); +/** + * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes + * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: BAR size in bytes + */ +resource_size_t pci_rebar_size_to_bytes(int size) +{ + return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE)); +} +EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes); + void pci_rebar_init(struct pci_dev *pdev) { pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); diff --git a/include/linux/pci.h b/include/linux/pci.h index 628dda63b9e0..33b27e0c4f3e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1422,6 +1422,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); +resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bb1fabd0d94efc29f88f86fb996c40ac06db3669 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:47 +0200 Subject: PCI: Add pci_rebar_size_supported() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many callers of pci_rebar_get_possible_sizes() are interested in finding out if a particular encoded BAR Size (PCIe r7.0, sec 7.8.6.3) is supported by the particular BAR. Add pci_rebar_size_supported() into PCI core to make it easy for the drivers to determine if the BAR size is supported or not. Use the new function in pci_resize_resource() and in pci_iov_vf_bar_set_size(). 
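A driver-side sketch of the intended use (illustrative only; the requested size and error code are arbitrary choices):

        int size = pci_rebar_bytes_to_size(SZ_8G);

        if (!pci_rebar_size_supported(pdev, bar, size))
                return -EOPNOTSUPP;

        err = pci_resize_resource(pdev, bar, size, 0);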
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Andi Shyti Link: https://patch.msgid.link/20251113180053.27944-6-ilpo.jarvinen@linux.intel.com --- drivers/pci/iov.c | 8 +------- drivers/pci/rebar.c | 25 +++++++++++++++++++------ include/linux/pci.h | 1 + 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 04b675e90963..71ed85d38508 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -1339,19 +1339,13 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple); */ int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size) { - u32 sizes; - if (!pci_resource_is_iov(resno)) return -EINVAL; if (pci_iov_is_memory_decoding_enabled(dev)) return -EBUSY; - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) + if (!pci_rebar_size_supported(dev, resno, size)) return -EINVAL; return pci_rebar_set_size(dev, resno, size); diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index 399660cb12fa..1c90b606b8d4 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -3,6 +3,7 @@ * PCI Resizable BAR Extended Capability handling. */ +#include #include #include #include @@ -124,6 +125,23 @@ u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) } EXPORT_SYMBOL(pci_rebar_get_possible_sizes); +/** + * pci_rebar_size_supported - check if size is supported for BAR + * @pdev: PCI device + * @bar: BAR to check + * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: %true if @bar is resizable and @size is supported, otherwise + * %false. + */ +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size) +{ + u64 sizes = pci_rebar_get_possible_sizes(pdev, bar); + + return BIT(size) & sizes; +} +EXPORT_SYMBOL_GPL(pci_rebar_size_supported); + /** * pci_rebar_get_current_size - get the current size of a Resizable BAR * @pdev: PCI device @@ -252,7 +270,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size, { struct pci_host_bridge *host; int old, ret; - u32 sizes; /* Check if we must preserve the firmware's resource assignment */ host = pci_find_host_bridge(dev->bus); @@ -262,11 +279,7 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size, if (pci_resize_is_memory_decoding_enabled(dev, resno)) return -EBUSY; - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) + if (!pci_rebar_size_supported(dev, resno, size)) return -EINVAL; old = pci_rebar_get_current_size(dev, resno); diff --git a/include/linux/pci.h b/include/linux/pci.h index 33b27e0c4f3e..0ef827cfaf0c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1424,6 +1424,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From 1c680f2acdbb3b64965962ca060a6daa6379575d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:50 +0200 Subject: PCI: Add pci_rebar_get_max_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pci_rebar_get_max_size() to allow simplifying code that wants to know the maximum possible 
size for a Resizable BAR. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-9-ilpo.jarvinen@linux.intel.com --- drivers/pci/rebar.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index 1c90b606b8d4..e99b89bd5e51 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -142,6 +143,28 @@ bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size) } EXPORT_SYMBOL_GPL(pci_rebar_size_supported); +/** + * pci_rebar_get_max_size - get the maximum supported size of a BAR + * @pdev: PCI device + * @bar: BAR to query + * + * Get the largest supported size of a resizable BAR as a size. + * + * Return: the encoded maximum BAR size as defined in the PCIe spec + * (0=1MB, 31=128TB), or %-NOENT on error. + */ +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar) +{ + u32 sizes; + + sizes = pci_rebar_get_possible_sizes(pdev, bar); + if (!sizes) + return -ENOENT; + + return __fls(sizes); +} +EXPORT_SYMBOL_GPL(pci_rebar_get_max_size); + /** * pci_rebar_get_current_size - get the current size of a Resizable BAR * @pdev: PCI device diff --git a/include/linux/pci.h b/include/linux/pci.h index 0ef827cfaf0c..898bc3a4e8e7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1425,6 +1425,7 @@ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bf0a90fc907e47344f88e5b9b241082184dbac27 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:53 +0200 Subject: PCI: Convert BAR sizes bitmasks to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0, sec 7.8.6, defines resizable BAR sizes beyond the currently supported maximum of 128TB, which will require more than u32 to store the entire bitmask. Convert Resizable BAR related functions to use u64 bitmask for BAR sizes to make the typing more future-proof. The support for the larger BAR sizes themselves is not added at this point. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-12-ilpo.jarvinen@linux.intel.com --- drivers/gpu/drm/xe/xe_vram.c | 2 +- drivers/pci/iov.c | 2 +- drivers/pci/pci-sysfs.c | 2 +- drivers/pci/rebar.c | 4 ++-- include/linux/pci.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index 524469f8a4bd..10f8a73e190b 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -69,7 +69,7 @@ static void resize_vram_bar(struct xe_device *xe) if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) { drm_info(&xe->drm, - "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n", + "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. 
Leaving default: %lluMiB\n", (u64)pci_rebar_size_to_bytes(rebar_size) >> 20, pci_rebar_get_possible_sizes(pdev, LMEM_BAR), (u64)current_size >> 20); diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 71ed85d38508..00784a60ba80 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -1367,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size); u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs) { u64 vf_len = pci_resource_len(dev, resno); - u32 sizes; + u64 sizes; if (!num_vfs) return 0; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 2a1b5456c2dc..cb512bf0df7c 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf) pci_config_pm_runtime_get(pdev); ret = sysfs_emit(buf, "%016llx\n", - (u64)pci_rebar_get_possible_sizes(pdev, n)); + pci_rebar_get_possible_sizes(pdev, n)); pci_config_pm_runtime_put(pdev); diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index e99b89bd5e51..7f6dece19138 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -105,7 +105,7 @@ static int pci_rebar_find_pos(struct pci_dev *pdev, int bar) * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if * BAR isn't resizable. */ -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) { int pos; u32 cap; @@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(pci_rebar_size_supported); */ int pci_rebar_get_max_size(struct pci_dev *pdev, int bar) { - u32 sizes; + u64 sizes; sizes = pci_rebar_get_possible_sizes(pdev, bar); if (!sizes) diff --git a/include/linux/pci.h b/include/linux/pci.h index 898bc3a4e8e7..4b7f4c08b5c7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1423,7 +1423,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, -- cgit v1.2.3 From 4518767be9089ea4f54754ad27364d6134fc46e2 Mon Sep 17 00:00:00 2001 From: Jianyun Gao Date: Sat, 27 Sep 2025 17:34:10 +0800 Subject: time: Fix a few typos in time[r] related code comments Signed-off-by: Jianyun Gao Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20250927093411.1509275-1-jianyungao89@gmail.com --- include/linux/delay.h | 8 ++++---- kernel/time/posix-timers.c | 2 +- kernel/time/timer_migration.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 89866bab100d..46412c00033a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max, * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. 
*/ @@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent - * contribution to the load avarage. + * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { @@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max) * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * - * Please refere to msleep() for detailed information. + * Please refer to msleep() for detailed information. */ static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..36dbb8146517 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 19ddfa96b9df..57e38674e56e 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -708,7 +708,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race -- cgit v1.2.3 From ef8057b07c72a817537856b98d6e7493b9404eaf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:33:33 +0100 Subject: PM: runtime: Wrapper macros for ACQUIRE()/ACQUIRE_ERR() Add wrapper macros for ACQUIRE()/ACQUIRE_ERR() and runtime PM usage counter guards introduced recently: pm_runtime_active_try, pm_runtime_active_auto_try, pm_runtime_active_try_enabled, and pm_runtime_active_auto_try_enabled. The new macros should be more straightforward to use. For example, they can be used for rewriting a piece of code like below: ACQUIRE(pm_runtime_active_try, pm)(dev); if ((ret = ACQUIRE_ERR(pm_runtime_active_try, &pm))) return ret; in the following way: PM_RUNTIME_ACQUIRE(dev, pm); if ((ret = PM_RUNTIME_ACQUIRE_ERR(&pm))) return ret; If the original code does not care about the specific error code returned when attempting to resume the device: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; it may be changed like this: PM_RUNTIME_ACQUIRE(dev, pm); if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; Link: https://lore.kernel.org/linux-pm/5068916.31r3eYUQgx@rafael.j.wysocki/ Signed-off-by: Rafael J.
Wysocki Reviewed-by: Dan Williams Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron Reviewed-by: Frank Li Link: https://patch.msgid.link/3400866.aeNJFYEL58@rafael.j.wysocki --- include/linux/pm_runtime.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. + * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From 1dcb98bbb7538d4b9015d47c934acdf5ea86045c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Nov 2025 15:33:41 -1000 Subject: sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs With the buddy lockup detector, smp_processor_id() returns the detecting CPU, not the locked CPU, making scx_hardlockup()'s printouts confusing. Pass the locked CPU number from watchdog_hardlockup_check() as a parameter instead. Also add kerneldoc comments to handle_lockup(), scx_hardlockup(), and scx_rcu_cpu_stall() documenting their return value semantics. 
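For example, with the buddy detector watchdog_hardlockup_check() now passes along the CPU it determined to be locked up instead of relying on smp_processor_id():

	/* 'cpu' is the locked-up CPU, which may differ from the detecting CPU */
	if (scx_hardlockup(cpu))
		return;
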
Suggested-by: Doug Anderson Reviewed-by: Douglas Anderson Acked-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 4 ++-- kernel/sched/ext.c | 25 ++++++++++++++++++++++--- kernel/watchdog.c | 2 +- 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 70ee5c28a74d..bcb962d5ee7d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -230,7 +230,7 @@ struct sched_ext_entity { void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); -bool scx_hardlockup(void); +bool scx_hardlockup(int cpu); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ @@ -238,7 +238,7 @@ bool scx_rcu_cpu_stall(void); static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} -static inline bool scx_hardlockup(void) { return false; } +static inline bool scx_hardlockup(int cpu) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8a3b8f64a06b..918573f3f088 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3687,6 +3687,17 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) return false; } +/** + * handle_lockup - sched_ext common lockup handler + * @fmt: format string + * + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. + */ static __printf(1, 2) bool handle_lockup(const char *fmt, ...) { struct scx_sched *sch; @@ -3718,6 +3729,10 @@ static __printf(1, 2) bool handle_lockup(const char *fmt, ...) * that may not be caused by the current BPF scheduler, try kicking out the * current scheduler in an attempt to recover the system to a good state before * issuing panics. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone + * else already initiated abort. */ bool scx_rcu_cpu_stall(void) { @@ -3750,14 +3765,18 @@ void scx_softlockup(u32 dur_s) * numerous affinitized tasks in a single queue and directing all CPUs at it. * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * someone else already initiated abort. 
*/ -bool scx_hardlockup(void) +bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", smp_processor_id())) + if (!handle_lockup("hard lockup - CPU %d", cpu)) return false; printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - smp_processor_id()); + cpu); return true; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8dfac4a8f587..873020a2a581 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -203,7 +203,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * only once when sched_ext is enabled and will immediately * abort the BPF scheduler and print out a warning message. */ - if (scx_hardlockup()) + if (scx_hardlockup(cpu)) return; /* Only print hardlockups once. */ -- cgit v1.2.3 From f86e51399c2a911a5b01d441de513f17bf773856 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Thu, 13 Nov 2025 17:02:27 -0800 Subject: PCI/IDE: Add Address Association Register setup for downstream MMIO The address ranges for downstream Address Association Registers need to cover memory addresses for all functions (PFs/VFs/downstream devices) managed by a Device Security Manager (DSM). The proposed solution is to get the memory (32-bit only) range and prefetchable-memory (64-bit capable) range from the immediate ancestor downstream port (either the direct-attach RP or deepest switch port when switch attached). Similar to RID association, address associations will be set by default if hardware sets 'Number of Address Association Register Blocks' in the 'Selective IDE Stream Capability Register' to a non-zero value. TSM drivers can opt out of the settings by zeroing out unwanted / unsupported address ranges. E.g. TDX Connect only supports prefetchable (64-bit capable) memory ranges for the Address Association setting. If the immediate downstream port provides both a memory range and prefetchable-memory range, but the IDE partner port only provides 1 Address Association Register block, then the TSM driver can pick which range to associate, or let the PCI core prioritize memory. Note, the Address Association Register setup for upstream requests is still uncertain, so it is not included. Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Co-developed-by: Arto Merilainen Signed-off-by: Arto Merilainen Signed-off-by: Xu Yilun Co-developed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251114010227.567693-1-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/ide.c | 117 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/pci-ide.h | 32 +++++++++++++ include/linux/pci.h | 5 +++ 3 files changed, 145 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index da5b1acccbb4..2e0856a4307b 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -155,8 +155,11 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) { /* EP, RP, + HB Stream allocation */ struct stream_index __stream[PCI_IDE_HB + 1]; + struct pci_bus_region pref_assoc = { 0, -1 }; + struct pci_bus_region mem_assoc = { 0, -1 }; + struct resource *mem, *pref; struct pci_host_bridge *hb; - struct pci_dev *rp; + struct pci_dev *rp, *br; int num_vf, rid_end; if (!pci_is_pcie(pdev)) @@ -197,6 +200,21 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) else rid_end = pci_dev_id(pdev); + br = pci_upstream_bridge(pdev); + if (!br) + return NULL; + + /* + * Check if the device consumes memory and/or prefetch-memory.
Setup + * downstream address association ranges for each. + */ + mem = pci_resource_n(br, PCI_BRIDGE_MEM_WINDOW); + pref = pci_resource_n(br, PCI_BRIDGE_PREF_MEM_WINDOW); + if (resource_assigned(mem)) + pcibios_resource_to_bus(br->bus, &mem_assoc, mem); + if (resource_assigned(pref)) + pcibios_resource_to_bus(br->bus, &pref_assoc, pref); + *ide = (struct pci_ide) { .pdev = pdev, .partner = { @@ -204,11 +222,16 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) .rid_start = pci_dev_id(rp), .rid_end = pci_dev_id(rp), .stream_index = no_free_ptr(ep_stream)->stream_index, + /* Disable upstream address association */ + .mem_assoc = { 0, -1 }, + .pref_assoc = { 0, -1 }, }, [PCI_IDE_RP] = { .rid_start = pci_dev_id(pdev), .rid_end = rid_end, .stream_index = no_free_ptr(rp_stream)->stream_index, + .mem_assoc = mem_assoc, + .pref_assoc = pref_assoc, }, }, .host_bridge_stream = no_free_ptr(hb_stream)->stream_index, @@ -385,6 +408,63 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); } +#define SEL_ADDR1_LOWER GENMASK(31, 20) +#define SEL_ADDR_UPPER GENMASK_ULL(63, 32) +#define PREP_PCI_IDE_SEL_ADDR1(base, limit) \ + (FIELD_PREP(PCI_IDE_SEL_ADDR_1_VALID, 1) | \ + FIELD_PREP(PCI_IDE_SEL_ADDR_1_BASE_LOW, \ + FIELD_GET(SEL_ADDR1_LOWER, (base))) | \ + FIELD_PREP(PCI_IDE_SEL_ADDR_1_LIMIT_LOW, \ + FIELD_GET(SEL_ADDR1_LOWER, (limit)))) + +static void mem_assoc_to_regs(struct pci_bus_region *region, + struct pci_ide_regs *regs, int idx) +{ + /* convert to u64 range for bitfield size checks */ + struct range r = { region->start, region->end }; + + regs->addr[idx].assoc1 = PREP_PCI_IDE_SEL_ADDR1(r.start, r.end); + regs->addr[idx].assoc2 = FIELD_GET(SEL_ADDR_UPPER, r.end); + regs->addr[idx].assoc3 = FIELD_GET(SEL_ADDR_UPPER, r.start); +} + +/** + * pci_ide_stream_to_regs() - convert IDE settings to association register values + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * @regs: output register values + */ +static void pci_ide_stream_to_regs(struct pci_dev *pdev, struct pci_ide *ide, + struct pci_ide_regs *regs) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int assoc_idx = 0; + + memset(regs, 0, sizeof(*regs)); + + if (!settings) + return; + + regs->rid1 = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end); + + regs->rid2 = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | + FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | + FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + + if (pdev->nr_ide_mem && pci_bus_region_size(&settings->mem_assoc)) { + mem_assoc_to_regs(&settings->mem_assoc, regs, assoc_idx); + assoc_idx++; + } + + if (pdev->nr_ide_mem > assoc_idx && + pci_bus_region_size(&settings->pref_assoc)) { + mem_assoc_to_regs(&settings->pref_assoc, regs, assoc_idx); + assoc_idx++; + } + + regs->nr_addr = assoc_idx; +} + /** * pci_ide_stream_setup() - program settings to Selective IDE Stream registers * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port @@ -398,22 +478,34 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide) { struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + struct pci_ide_regs regs; int pos; - u32 val; if (!settings) return; + pci_ide_stream_to_regs(pdev, ide, ®s); + pos = sel_ide_offset(pdev, settings); - val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, 
settings->rid_end); - pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, regs.rid1); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, regs.rid2); - val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | - FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | - FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + for (int i = 0; i < regs.nr_addr; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), + regs.addr[i].assoc1); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), + regs.addr[i].assoc2); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), + regs.addr[i].assoc3); + } - pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val); + /* clear extra unused address association blocks */ + for (int i = regs.nr_addr; i < pdev->nr_ide_mem; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0); + } /* * Setup control register early for devices that expect @@ -436,7 +528,7 @@ EXPORT_SYMBOL_GPL(pci_ide_stream_setup); void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) { struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); - int pos; + int pos, i; if (!settings) return; @@ -444,6 +536,13 @@ void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) pos = sel_ide_offset(pdev, settings); pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + + for (i = 0; i < pdev->nr_ide_mem; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0); + } + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0); pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0); settings->setup = 0; diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index d0f10f3c89fc..93194338e4d0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -28,21 +28,53 @@ enum pci_ide_partner_select { * @rid_start: Partner Port Requester ID range start * @rid_end: Partner Port Requester ID range end * @stream_index: Selective IDE Stream Register Block selection + * @mem_assoc: PCI bus memory address association for targeting peer partner + * @pref_assoc: PCI bus prefetchable memory address association for + * targeting peer partner * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of * address and RID association registers * @setup: flag to track whether to run pci_ide_stream_teardown() for this * partner slot * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + * + * By default, pci_ide_stream_alloc() initializes @mem_assoc and @pref_assoc + * with the immediate ancestor downstream port memory ranges (i.e. Type 1 + * Configuration Space Header values). Caller may zero size ({0, -1}) the range + * to drop it from consideration at pci_ide_stream_setup() time. 
*/ struct pci_ide_partner { u16 rid_start; u16 rid_end; u8 stream_index; + struct pci_bus_region mem_assoc; + struct pci_bus_region pref_assoc; unsigned int default_stream:1; unsigned int setup:1; unsigned int enable:1; }; +/** + * struct pci_ide_regs - Hardware register association settings for Selective + * IDE Streams + * @rid1: IDE RID Association Register 1 + * @rid2: IDE RID Association Register 2 + * @addr: Up to two address association blocks (IDE Address Association Register + * 1 through 3) for MMIO and prefetchable MMIO + * @nr_addr: Number of address association blocks initialized + * + * See pci_ide_stream_to_regs() + */ +struct pci_ide_regs { + u32 rid1; + u32 rid2; + struct { + u32 assoc1; + u32 assoc2; + u32 assoc3; + } addr[2]; + int nr_addr; +}; + /** * struct pci_ide - PCIe Selective IDE Stream descriptor * @pdev: PCIe Endpoint in the pci_ide_partner pair diff --git a/include/linux/pci.h b/include/linux/pci.h index 2c8dbae4916c..ba39ca78b382 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -870,6 +870,11 @@ struct pci_bus_region { pci_bus_addr_t end; }; +static inline pci_bus_addr_t pci_bus_region_size(const struct pci_bus_region *region) +{ + return region->end - region->start + 1; +} + struct pci_dynids { spinlock_t lock; /* Protects list, index */ struct list_head list; /* For IDs added at runtime */ -- cgit v1.2.3 From 079115370d00c78ef69b31dd15def90adf2aa579 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:43 -0800 Subject: PCI/IDE: Initialize an ID for all IDE streams The PCIe spec defines two types of streams - selective and link. Each stream has an ID from the same bucket so a stream ID does not tell the type. The spec defines an "enable" bit for every stream and requires stream IDs to be unique among all enabled streams, but there is no such requirement for disabled streams. However, when IDE_KM is programming keys, an IDE-capable device needs to know the type of stream being programmed so it can write the keys directly to the hardware, as keys are relatively large, there may be many of them, and devices often struggle to keep around big data that is not in use. Walk through all streams on a device and initialise the IDs to some unique number, both link and selective. The weakest part of this proposal is the host bridge ide_stream_ids_ida. Technically, a Stream ID only needs to be unique within a given partner pair. However, with "anonymous" / unassigned streams there is no convenient place to track the available IDs. Proceed with an ida in the host bridge for now, but consider moving this tracking to be an ide_stream_ids_ida per device.
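As a sketch of how the proposed ida would be consumed (hypothetical caller, not part of this patch), a TSM driver could claim a unique Stream ID for a new stream with:

	/* PCI_IDE_RESERVED_STREAM_ID (255) is pre-reserved as the "never enabled" placeholder */
	id = ida_alloc_range(&hb->ide_stream_ids_ida, 0,
			     PCI_IDE_RESERVED_STREAM_ID - 1, GFP_KERNEL);
	if (id < 0)
		return id;	/* no unique Stream ID left in this domain */
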
Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/ide.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 + drivers/pci/remove.c | 1 + include/linux/pci-ide.h | 6 +++ include/linux/pci.h | 1 + 5 files changed, 139 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 2e0856a4307b..f0ef474e1a0d 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -35,8 +35,50 @@ static int sel_ide_offset(struct pci_dev *pdev, settings->stream_index, pdev->nr_ide_mem); } +static bool reserve_stream_index(struct pci_dev *pdev, u8 idx) +{ + int ret; + + ret = ida_alloc_range(&pdev->ide_stream_ida, idx, idx, GFP_KERNEL); + return ret >= 0; +} + +static bool reserve_stream_id(struct pci_host_bridge *hb, u8 id) +{ + int ret; + + ret = ida_alloc_range(&hb->ide_stream_ids_ida, id, id, GFP_KERNEL); + return ret >= 0; +} + +static bool claim_stream(struct pci_host_bridge *hb, u8 stream_id, + struct pci_dev *pdev, u8 stream_idx) +{ + dev_info(&hb->dev, "Stream ID %d active at init\n", stream_id); + if (!reserve_stream_id(hb, stream_id)) { + dev_info(&hb->dev, "Failed to claim %s Stream ID %d\n", + stream_id == PCI_IDE_RESERVED_STREAM_ID ? "reserved" : + "active", + stream_id); + return false; + } + + /* No stream index to reserve in the Link IDE case */ + if (!pdev) + return true; + + if (!reserve_stream_index(pdev, stream_idx)) { + pci_info(pdev, "Failed to claim active Selective Stream %d\n", + stream_idx); + return false; + } + + return true; +} + void pci_ide_init(struct pci_dev *pdev) { + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); u16 nr_link_ide, nr_ide_mem, nr_streams; u16 ide_cap; u32 val; @@ -83,6 +125,7 @@ void pci_ide_init(struct pci_dev *pdev) int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); int nr_assoc; u32 val; + u8 id; pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); @@ -98,6 +141,51 @@ void pci_ide_init(struct pci_dev *pdev) } nr_ide_mem = nr_assoc; + + /* + * Claim Stream IDs and Selective Stream blocks that are already + * active on the device + */ + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CTL, &val); + id = FIELD_GET(PCI_IDE_SEL_CTL_ID, val); + if ((val & PCI_IDE_SEL_CTL_EN) && + !claim_stream(hb, id, pdev, i)) + return; + } + + /* Reserve link stream-ids that are already active on the device */ + for (u16 i = 0; i < nr_link_ide; ++i) { + int pos = ide_cap + PCI_IDE_LINK_STREAM_0 + i * PCI_IDE_LINK_BLOCK_SIZE; + u8 id; + + pci_read_config_dword(pdev, pos + PCI_IDE_LINK_CTL_0, &val); + id = FIELD_GET(PCI_IDE_LINK_CTL_ID, val); + if ((val & PCI_IDE_LINK_CTL_EN) && + !claim_stream(hb, id, NULL, -1)) + return; + } + + for (u16 i = 0; i < nr_streams; i++) { + int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); + if (val & PCI_IDE_SEL_CTL_EN) + continue; + val &= ~PCI_IDE_SEL_CTL_ID; + val |= FIELD_PREP(PCI_IDE_SEL_CTL_ID, PCI_IDE_RESERVED_STREAM_ID); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); + } + + for (u16 i = 0; i < nr_link_ide; ++i) { + int pos = ide_cap + PCI_IDE_LINK_STREAM_0 + + i * PCI_IDE_LINK_BLOCK_SIZE; + + pci_read_config_dword(pdev, pos, &val); + if (val & PCI_IDE_LINK_CTL_EN) + continue; + val &= ~PCI_IDE_LINK_CTL_ID; + val |= FIELD_PREP(PCI_IDE_LINK_CTL_ID, PCI_IDE_RESERVED_STREAM_ID); + 
pci_write_config_dword(pdev, pos, val); } pdev->ide_cap = ide_cap; @@ -301,6 +389,28 @@ void pci_ide_stream_release(struct pci_ide *ide) } EXPORT_SYMBOL_GPL(pci_ide_stream_release); +struct pci_ide_stream_id { + struct pci_host_bridge *hb; + u8 stream_id; +}; + +static struct pci_ide_stream_id * +request_stream_id(struct pci_host_bridge *hb, u8 stream_id, + struct pci_ide_stream_id *sid) +{ + if (!reserve_stream_id(hb, stream_id)) + return NULL; + + *sid = (struct pci_ide_stream_id) { + .hb = hb, + .stream_id = stream_id, + }; + + return sid; +} +DEFINE_FREE(free_stream_id, struct pci_ide_stream_id *, + if (_T) ida_free(&_T->hb->ide_stream_ids_ida, _T->stream_id)) + /** * pci_ide_stream_register() - Prepare to activate an IDE Stream * @ide: IDE settings descriptor @@ -313,6 +423,7 @@ int pci_ide_stream_register(struct pci_ide *ide) { struct pci_dev *pdev = ide->pdev; struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + struct pci_ide_stream_id __sid; u8 ep_stream, rp_stream; int rc; @@ -321,6 +432,13 @@ int pci_ide_stream_register(struct pci_ide *ide) return -ENXIO; } + struct pci_ide_stream_id *sid __free(free_stream_id) = + request_stream_id(hb, ide->stream_id, &__sid); + if (!sid) { + pci_err(pdev, "Setup fail: Stream ID %d in use\n", ide->stream_id); + return -EBUSY; + } + ep_stream = ide->partner[PCI_IDE_EP].stream_index; rp_stream = ide->partner[PCI_IDE_RP].stream_index; const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d", @@ -335,6 +453,9 @@ int pci_ide_stream_register(struct pci_ide *ide) ide->name = no_free_ptr(name); + /* Stream ID reservation recorded in @ide is now successfully registered */ + retain_and_null_ptr(sid); + return 0; } EXPORT_SYMBOL_GPL(pci_ide_stream_register); @@ -353,6 +474,7 @@ void pci_ide_stream_unregister(struct pci_ide *ide) sysfs_remove_link(&hb->dev.kobj, ide->name); kfree(ide->name); + ida_free(&hb->ide_stream_ids_ida, ide->stream_id); ide->name = NULL; } EXPORT_SYMBOL_GPL(pci_ide_stream_unregister); @@ -616,6 +738,8 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { hb->nr_ide_streams = 256; ida_init(&hb->ide_stream_ida); + ida_init(&hb->ide_stream_ids_ida); + reserve_stream_id(hb, PCI_IDE_RESERVED_STREAM_ID); } static ssize_t available_secure_streams_show(struct device *dev, @@ -684,3 +808,8 @@ void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr) sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group); } EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE"); + +void pci_ide_destroy(struct pci_dev *pdev) +{ + ida_destroy(&pdev->ide_stream_ida); +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f6ffe5ee4717..641c0b53c4e3 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -616,10 +616,12 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); void pci_ide_init_host_bridge(struct pci_host_bridge *hb); +void pci_ide_destroy(struct pci_dev *dev); extern const struct attribute_group pci_ide_attr_group; #else static inline void pci_ide_init(struct pci_dev *dev) { } static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } +static inline void pci_ide_destroy(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCI_TSM diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 803391892c4a..417a9ea59117 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -70,6 +70,7 @@ static void pci_destroy_dev(struct pci_dev *dev) up_write(&pci_bus_sem); pci_doe_destroy(dev); + pci_ide_destroy(dev); 
pcie_aspm_exit_link_state(dev); pci_bridge_d3_update(dev); pci_pwrctrl_unregister(&dev->dev); diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 93194338e4d0..37a1ad9501b0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -97,6 +97,12 @@ struct pci_ide { struct tsm_dev *tsm_dev; }; +/* + * Some devices need help with aliased stream-ids even for idle streams. Use + * this id as the "never enabled" place holder. + */ +#define PCI_IDE_RESERVED_STREAM_ID 255 + void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); diff --git a/include/linux/pci.h b/include/linux/pci.h index ba39ca78b382..52a235c61023 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -619,6 +619,7 @@ struct pci_host_bridge { #ifdef CONFIG_PCI_IDE u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ struct ida ide_stream_ida; + struct ida ide_stream_ids_ida; /* track unique ids per domain */ #endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); -- cgit v1.2.3 From 50cbec192f5317e29be993e2a634bbbdfcf0230e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:44 -0800 Subject: PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs After a PCIe device has established a secure link and session between a TEE Security Manager (TSM) and its local Device Security Manager (DSM), the device or its subfunctions are candidates to be bound to a private memory context, a TVM. A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). The pci_tsm_bind() requests the low-level TSM driver to associate the device with private MMIO and private IOMMU context resources of a given TVM represented by a @kvm argument. A device in the bound state corresponds to the TDISP protocol LOCKED state and awaits validation by the TVM. It is a 'struct pci_tsm_link_ops' operation because, similar to IDE establishment, it involves host side resource establishment and context setup on behalf of the guest. It is also expected to be performed lazily to allow for operation of the device in non-confidential "shared" context for pre-lock configuration. Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-7-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/tsm.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/pci-tsm.h | 34 +++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c index 6a2849f77adc..39de91a47a26 100644 --- a/drivers/pci/tsm.c +++ b/drivers/pci/tsm.c @@ -270,6 +270,95 @@ static int remove_fn(struct pci_dev *pdev, void *data) return 0; } +/* + * Note, this helper only returns an error code and takes an argument for + * compatibility with the pci_walk_bus() callback prototype. pci_tsm_unbind() + * always succeeds. 
+ */ +static int __pci_tsm_unbind(struct pci_dev *pdev, void *data) +{ + struct pci_tdi *tdi; + struct pci_tsm_pf0 *tsm_pf0; + + lockdep_assert_held(&pci_tsm_rwsem); + + if (!pdev->tsm) + return 0; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + guard(mutex)(&tsm_pf0->lock); + + tdi = pdev->tsm->tdi; + if (!tdi) + return 0; + + to_pci_tsm_ops(pdev->tsm)->unbind(tdi); + pdev->tsm->tdi = NULL; + + return 0; +} + +void pci_tsm_unbind(struct pci_dev *pdev) +{ + guard(rwsem_read)(&pci_tsm_rwsem); + __pci_tsm_unbind(pdev, NULL); +} +EXPORT_SYMBOL_GPL(pci_tsm_unbind); + +/** + * pci_tsm_bind() - Bind @pdev as a TDI for @kvm + * @pdev: PCI device function to bind + * @kvm: Private memory attach context + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + * + * Returns 0 on success, or a negative error code on failure. + * + * Context: Caller is responsible for constraining the bind lifetime to the + * registered state of the device. For example, pci_tsm_bind() / + * pci_tsm_unbind() limited to the VFIO driver bound state of the device. + */ +int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id) +{ + struct pci_tsm_pf0 *tsm_pf0; + struct pci_tdi *tdi; + + if (!kvm) + return -EINVAL; + + guard(rwsem_read)(&pci_tsm_rwsem); + + if (!pdev->tsm) + return -EINVAL; + + if (!is_link_tsm(pdev->tsm->tsm_dev)) + return -ENXIO; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + guard(mutex)(&tsm_pf0->lock); + + /* Resolve races to bind a TDI */ + if (pdev->tsm->tdi) { + if (pdev->tsm->tdi->kvm != kvm) + return -EBUSY; + return 0; + } + + tdi = to_pci_tsm_ops(pdev->tsm)->bind(pdev, kvm, tdi_id); + if (IS_ERR(tdi)) + return PTR_ERR(tdi); + + pdev->tsm->tdi = tdi; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_tsm_bind); + +static void pci_tsm_unbind_all(struct pci_dev *pdev) +{ + pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL); + __pci_tsm_unbind(pdev, NULL); +} + static void __pci_tsm_disconnect(struct pci_dev *pdev) { struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); @@ -278,6 +367,8 @@ static void __pci_tsm_disconnect(struct pci_dev *pdev) /* disconnect() mutually exclusive with subfunction pci_tsm_init() */ lockdep_assert_held_write(&pci_tsm_rwsem); + pci_tsm_unbind_all(pdev); + /* * disconnect() is uninterruptible as it may be called for device * teardown @@ -439,6 +530,22 @@ static struct pci_dev *find_dsm_dev(struct pci_dev *pdev) return NULL; } +/** + * pci_tsm_tdi_constructor() - base 'struct pci_tdi' initialization for link TSMs + * @pdev: PCI device function representing the TDI + * @tdi: context to initialize + * @kvm: Private memory attach context + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + */ +void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, + struct kvm *kvm, u32 tdi_id) +{ + tdi->pdev = pdev; + tdi->kvm = kvm; + tdi->tdi_id = tdi_id; +} +EXPORT_SYMBOL_GPL(pci_tsm_tdi_constructor); + /** * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs * @pdev: The PCI device @@ -532,7 +639,7 @@ int pci_tsm_register(struct tsm_dev *tsm_dev) static void pci_tsm_fn_exit(struct pci_dev *pdev) { - /* TODO: unbind the fn */ + __pci_tsm_unbind(pdev, NULL); tsm_remove(pdev->tsm); } diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index d7b078d5e272..a5e297677917 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -6,6 +6,8 @@ struct pci_tsm; struct tsm_dev; +struct kvm; +enum pci_tsm_req_scope; /* * struct pci_tsm_ops - manage confidential links and 
security state @@ -29,12 +31,16 @@ struct pci_tsm_ops { * @connect: establish / validate a secure connection (e.g. IDE) * with the device * @disconnect: teardown the secure link + * @bind: bind a TDI in preparation for it to be accepted by a TVM + * @unbind: remove a TDI from secure operation with a TVM * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. @connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. + * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM + * lock. */ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -42,6 +48,9 @@ struct pci_tsm_ops { void (*remove)(struct pci_tsm *tsm); int (*connect)(struct pci_dev *pdev); void (*disconnect)(struct pci_dev *pdev); + struct pci_tdi *(*bind)(struct pci_dev *pdev, + struct kvm *kvm, u32 tdi_id); + void (*unbind)(struct pci_tdi *tdi); ); /* @@ -61,12 +70,25 @@ struct pci_tsm_ops { ); }; +/** + * struct pci_tdi - Core TEE I/O Device Interface (TDI) context + * @pdev: host side representation of guest-side TDI + * @kvm: TEE VM context of bound TDI + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + */ +struct pci_tdi { + struct pci_dev *pdev; + struct kvm *kvm; + u32 tdi_id; +}; + /** * struct pci_tsm - Core TSM context for a given PCIe endpoint * @pdev: Back ref to device function, distinguishes type of pci_tsm context * @dsm_dev: PCI Device Security Manager for link operations on @pdev * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device * Function Security operations + * @tdi: TDI context established by the @bind link operation * * This structure is wrapped by low level TSM driver data and returned by * probe()/lock(), it is freed by the corresponding remove()/unlock(). @@ -82,6 +104,7 @@ struct pci_tsm { struct pci_dev *pdev; struct pci_dev *dsm_dev; struct tsm_dev *tsm_dev; + struct pci_tdi *tdi; }; /** @@ -139,6 +162,10 @@ int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, size_t req_sz, void *resp, size_t resp_sz); +int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); +void pci_tsm_unbind(struct pci_dev *pdev); +void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, + struct kvm *kvm, u32 tdi_id); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -147,5 +174,12 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } +static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id) +{ + return -ENXIO; +} +static inline void pci_tsm_unbind(struct pci_dev *pdev) +{ +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c316c75d57fbb34e2305690813f4dbec9311f2b0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:45 -0800 Subject: PCI/TSM: Add pci_tsm_guest_req() for managing TDIs A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). A TDI instantiated by pci_tsm_bind() needs additional steps taken by the TVM to be accepted into the TVM's Trusted Compute Boundary (TCB) and transitioned to the RUN state. 
pci_tsm_guest_req() is a channel for the guest to request TDISP collateral, like Device Interface Reports, and effect TDISP state changes, like LOCKED->RUN transitions. Similar to IDE establishment and pci_tsm_bind(), these are long running operations involving SPDM message passing via the DOE mailbox. The path for a TVM to invoke pci_tsm_guest_req() is: * TSM triggers exit via guest-to-host-interface ABI (implementation specific) * VMM invokes handler (KVM handle_exit() -> userspace io) * handler issues request (userspace io handler -> ioctl() -> pci_tsm_guest_req()) * handler supplies response * VMM posts response, notifies/re-enters TVM This path is purely a transport for messages from TVM to platform TSM. By design the host kernel does not and must not care about the content of these messages. I.e. the host kernel is not in the TCB of the TVM. As this is an opaque passthrough interface, similar to fwctl, the kernel requires that implementations stay within the bounds defined by 'enum pci_tsm_req_scope'. Violation of those expectations likely has market and regulatory consequences. Out of scope requests are blocked by default. Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/tsm.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci-tsm.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c index 39de91a47a26..5e57501f693e 100644 --- a/drivers/pci/tsm.c +++ b/drivers/pci/tsm.c @@ -353,6 +353,66 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id) } EXPORT_SYMBOL_GPL(pci_tsm_bind); +/** + * pci_tsm_guest_req() - helper to marshal guest requests to the TSM driver + * @pdev: @pdev representing a bound tdi + * @scope: caller asserts this passthrough request is limited to TDISP operations + * @req_in: Input payload forwarded from the guest + * @in_len: Length of @req_in + * @req_out: Output payload buffer response to the guest + * @out_len: Length of @req_out on input, bytes filled in @req_out on output + * @tsm_code: Optional TSM arch specific result code for the guest TSM + * + * This is a common entry point for requests triggered by userspace KVM-exit + * service handlers responding to TDI information or state change requests. The + * scope parameter limits requests to TDISP state management, or limited debug. + * This path is only suitable for commands and results that are the host kernel + * has no use, the host is only facilitating guest to TSM communication. + * + * Returns 0 on success and -error on failure and positive "residue" on success + * but @req_out is filled with less then @out_len, or @req_out is NULL and a + * residue number of bytes were not consumed from @req_in. On success or + * failure @tsm_code may be populated with a TSM implementation specific result + * code for the guest to consume. + * + * Context: Caller is responsible for calling this within the pci_tsm_bind() + * state of the TDI.
+ */ +ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, sockptr_t req_out, + size_t out_len, u64 *tsm_code) +{ + struct pci_tsm_pf0 *tsm_pf0; + struct pci_tdi *tdi; + int rc; + + /* Forbid requests that are not directly related to TDISP operations */ + if (scope > PCI_TSM_REQ_STATE_CHANGE) + return -EINVAL; + + ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &lock))) + return rc; + + if (!pdev->tsm) + return -ENXIO; + + if (!is_link_tsm(pdev->tsm->tsm_dev)) + return -ENXIO; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + ACQUIRE(mutex_intr, ops_lock)(&tsm_pf0->lock); + if ((rc = ACQUIRE_ERR(mutex_intr, &ops_lock))) + return rc; + + tdi = pdev->tsm->tdi; + if (!tdi) + return -ENXIO; + return to_pci_tsm_ops(pdev->tsm)->guest_req(tdi, scope, req_in, in_len, + req_out, out_len, tsm_code); +} +EXPORT_SYMBOL_GPL(pci_tsm_guest_req); + static void pci_tsm_unbind_all(struct pci_dev *pdev) { pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL); diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index a5e297677917..a6435aba03f9 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -3,6 +3,7 @@ #define __PCI_TSM_H #include #include +#include struct pci_tsm; struct tsm_dev; @@ -33,14 +34,15 @@ struct pci_tsm_ops { * @disconnect: teardown the secure link * @bind: bind a TDI in preparation for it to be accepted by a TVM * @unbind: remove a TDI from secure operation with a TVM + * @guest_req: marshal TVM information and state change requests * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. @connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. - * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM - * lock. + * @bind, @unbind, and @guest_req run under pci_tsm_rwsem held for read + * and the DSM lock. */ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -51,6 +53,11 @@ struct pci_tsm_ops { struct pci_tdi *(*bind)(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void (*unbind)(struct pci_tdi *tdi); + ssize_t (*guest_req)(struct pci_tdi *tdi, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code); ); /* @@ -152,6 +159,46 @@ static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) return PCI_FUNC(pdev->devfn) == 0; } +/** + * enum pci_tsm_req_scope - Scope of guest requests to be validated by TSM + * + * Guest requests are a transport for a TVM to communicate with a TSM + DSM for + * a given TDI. A TSM driver is responsible for maintaining the kernel security + * model and limit commands that may affect the host, or are otherwise outside + * the typical TDISP operational model. + */ +enum pci_tsm_req_scope { + /** + * @PCI_TSM_REQ_INFO: Read-only, without side effects, request for + * typical TDISP collateral information like Device Interface Reports. + * No device secrets are permitted, and no device state is changed. + */ + PCI_TSM_REQ_INFO = 0, + /** + * @PCI_TSM_REQ_STATE_CHANGE: Request to change the TDISP state from + * UNLOCKED->LOCKED, LOCKED->RUN, or other architecture specific state + * changes to support those transitions for a TDI. 
No other (unrelated + * to TDISP) device / host state, configuration, or data change is + * permitted. + */ + PCI_TSM_REQ_STATE_CHANGE = 1, + /** + * @PCI_TSM_REQ_DEBUG_READ: Read-only request for debug information + * + * A method to facilitate TVM information retrieval outside of typical + * TDISP operational requirements. No device secrets are permitted. + */ + PCI_TSM_REQ_DEBUG_READ = 2, + /** + * @PCI_TSM_REQ_DEBUG_WRITE: Device state changes for debug purposes + * + * The request may affect the operational state of the device outside of + * the TDISP operational model. If allowed, requires CAP_SYS_RAW_IO, and + * will taint the kernel. + */ + PCI_TSM_REQ_DEBUG_WRITE = 3, +}; + #ifdef CONFIG_PCI_TSM int pci_tsm_register(struct tsm_dev *tsm_dev); void pci_tsm_unregister(struct tsm_dev *tsm_dev); @@ -166,6 +213,9 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void pci_tsm_unbind(struct pci_dev *pdev); void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, struct kvm *kvm, u32 tdi_id); +ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, sockptr_t req_out, + size_t out_len, u64 *tsm_code); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -181,5 +231,13 @@ static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id static inline void pci_tsm_unbind(struct pci_dev *pdev) { } +static inline ssize_t pci_tsm_guest_req(struct pci_dev *pdev, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code) +{ + return -ENXIO; +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:04 -0700 Subject: Drivers: hv: VMBus protocol version 6.0 The confidential VMBus is supported starting from the protocol version 6.0 onwards. Provide the required definitions. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 12 ++++++++ include/hyperv/hvgdk_mini.h | 1 + include/linux/hyperv.h | 69 ++++++++++++++++++++++++++++++++------------- 4 files changed, 65 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 0b450e53161e..4a01797d4851 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -333,6 +333,8 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..3c414560fa5f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -56,6 +56,18 @@ static long __percpu *vmbus_evt; int vmbus_irq; int vmbus_interrupt; +/* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + /* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. 
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 77abddfc750e..7f730a0e54e6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -260,6 +260,7 @@ union hv_hypervisor_version_info { #define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 /* Support for the extended IOAPIC RTE format */ #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) +#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 59826c89171c..dfc516c1c719 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent( * Linux kernel. */ -#define VERSION_WS2008 ((0 << 16) | (13)) -#define VERSION_WIN7 ((1 << 16) | (1)) -#define VERSION_WIN8 ((2 << 16) | (4)) -#define VERSION_WIN8_1 ((3 << 16) | (0)) -#define VERSION_WIN10 ((4 << 16) | (0)) -#define VERSION_WIN10_V4_1 ((4 << 16) | (1)) -#define VERSION_WIN10_V5 ((5 << 16) | (0)) -#define VERSION_WIN10_V5_1 ((5 << 16) | (1)) -#define VERSION_WIN10_V5_2 ((5 << 16) | (2)) -#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) +#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN)) +#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13) +#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1) +#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4) +#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0) +#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0) +#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1) +#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0) +#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1) +#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2) +#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3) +#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -335,14 +337,22 @@ struct vmbus_channel_offer { } __packed; /* Server Flags */ -#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4 -#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10 -#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 -#define VMBUS_CHANNEL_PARENT_OFFER 0x200 -#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 -#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 +#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for the channel ring buffer. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for GPA direct packets and additional GPADLs. 
+ */ +#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004 +#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010 +#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100 +#define VMBUS_CHANNEL_PARENT_OFFER 0x0200 +#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400 +#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 struct vmpacket_descriptor { u16 type; @@ -621,6 +631,12 @@ struct vmbus_channel_relid_released { u32 child_relid; } __packed; +/* + * Used by the paravisor only, means that the encrypted ring buffers and + * the encrypted external memory are supported + */ +#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10 + struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; @@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact { struct { u8 msg_sint; u8 msg_vtl; - u8 reserved[6]; + u8 reserved[2]; + u32 feature_flags; /* VMBus version 6.0 */ }; }; u64 monitor_page1; @@ -1003,6 +1020,10 @@ struct vmbus_channel { /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; + /* The ring buffer is encrypted */ + bool co_ring_buffer; + /* The external memory is encrypted */ + bool co_external_memory; }; #define lock_requestor(channel, flags) \ @@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER); +} + +static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY); +} + static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) { return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); -- cgit v1.2.3 From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:47 +0000 Subject: static_call: allow using STATIC_CALL_TRAMP_STR() from assembly STATIC_CALL_TRAMP_STR() could not be used from .S files because static_call_types.h was not safe to include in assembly as it pulled in C types/constructs that are unavailable under __ASSEMBLY__. Make the header assembly-friendly by adding __ASSEMBLY__ checks and providing only the minimal definitions needed for assembly, so that it can be safely included by .S code. This enables emitting the static call trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly sources, to be used with 'call' instruction. Also, move a certain definitions out of __ASSEMBLY__ checks in compiler_types.h to meet the dependencies. No functional change for C compilation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/linux/compiler_types.h | 8 ++++---- include/linux/static_call_types.h | 4 ++++ tools/include/linux/static_call_types.h | 4 ++++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..6897d4d5cb28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,6 +11,10 @@ #define __has_builtin(x) (0) #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. 
*/ +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + #ifndef __ASSEMBLY__ /* @@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __builtin_warning(x, y...) (1) #endif /* __CHECKER__ */ -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ /* Attributes */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ -- cgit v1.2.3 From 39231e8d6ba7f794b566fd91ebd88c0834a23b98 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Fri, 14 Nov 2025 22:49:20 +0100 Subject: mm: fix MAX_FOLIO_ORDER on powerpc configs with hugetlb In the past, CONFIG_ARCH_HAS_GIGANTIC_PAGE indicated that we support runtime allocation of gigantic hugetlb folios. In the meantime it evolved into a generic way for the architecture to state that it supports gigantic hugetlb folios. In commit fae7d834c43c ("mm: add __dump_folio()") we started using CONFIG_ARCH_HAS_GIGANTIC_PAGE to decide MAX_FOLIO_ORDER: whether we could have folios larger than what the buddy can handle. In the context of that commit, we started using MAX_FOLIO_ORDER to detect page corruptions when dumping tail pages of folios. Before that commit, we assumed that we cannot have folios larger than the highest buddy order, which was obviously wrong. In commit 7b4f21f5e038 ("mm/hugetlb: check for unreasonable folio sizes when registering hstate"), we used MAX_FOLIO_ORDER to detect inconsistencies, and in fact, we found some now. Powerpc allows for configs that can allocate gigantic folio during boot (not at runtime), that do not set CONFIG_ARCH_HAS_GIGANTIC_PAGE and can exceed PUD_ORDER. To fix it, let's make powerpc select CONFIG_ARCH_HAS_GIGANTIC_PAGE with hugetlb on powerpc, and increase the maximum folio size with hugetlb to 16 GiB on 64bit (possible on arm64 and powerpc) and 1 GiB on 32 bit (powerpc). Note that on some powerpc configurations, whether we actually have gigantic pages depends on the setting of CONFIG_ARCH_FORCE_MAX_ORDER, but there is nothing really problematic about setting it unconditionally: we just try to keep the value small so we can better detect problems in __dump_folio() and inconsistencies around the expected largest folio in the system. 
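For reference, with 4 KiB base pages the new hugetlb limit below, get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G), works out to:

  64-bit: MAX_FOLIO_ORDER = get_order(SZ_16G) = 34 - 12 = 22  (2^22 = 4194304 base pages)
  32-bit: MAX_FOLIO_ORDER = get_order(SZ_1G)  = 30 - 12 = 18  (2^18 = 262144 base pages)

Larger base page sizes shrink these orders accordingly; the constants are only a sanity bound for __dump_folio() and the hstate registration check, not something the buddy allocator can hand out.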
Ideally, we'd have a better way to obtain the maximum hugetlb folio size and detect ourselves whether we really end up with gigantic folios. Let's defer bigger changes and fix the warnings first. While at it, handle gigantic DAX folios more clearly: DAX can only end up creating gigantic folios with HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. Add a new Kconfig option HAVE_GIGANTIC_FOLIOS to make both cases clearer. In particular, worry about ARCH_HAS_GIGANTIC_PAGE only with HUGETLB_PAGE. Note: with enabling CONFIG_ARCH_HAS_GIGANTIC_PAGE on powerpc, we will now also allow for runtime allocations of folios in some more powerpc configs. I don't think this is a problem, but if it is we could handle it through __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED. While __dump_page()/__dump_folio was also problematic (not handling dumping of tail pages of such gigantic folios correctly), it doesn't seem critical enough to mark it as a fix. Link: https://lkml.kernel.org/r/20251114214920.2550676-1-david@kernel.org Fixes: 7b4f21f5e038 ("mm/hugetlb: check for unreasonable folio sizes when registering hstate") Reported-by: Christophe Leroy Closes: https://lore.kernel.org/r/3e043453-3f27-48ad-b987-cc39f523060a@csgroup.eu/ Reported-by: Sourabh Jain Closes: https://lore.kernel.org/r/94377f5c-d4f0-4c0f-b0f6-5bf1cd7305b1@linux.ibm.com/ Signed-off-by: David Hildenbrand (Red Hat) Cc: Ritesh Harjani (IBM) Cc: Madhavan Srinivasan Cc: Donet Tom Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/platforms/Kconfig.cputype | 1 - include/linux/mm.h | 13 ++++++++++--- mm/Kconfig | 7 +++++++ 4 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e24f4d88885a..9537a61ebae0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_DMA_OPS if PPC64 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE if ARCH_SUPPORTS_HUGETLBFS select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7b527d18aa5e..4c321a8ea896 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -423,7 +423,6 @@ config PPC_64S_HASH_MMU config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 - select ARCH_HAS_GIGANTIC_PAGE default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..7c79b3369b82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2074,7 +2074,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) /* * We don't expect any folios that exceed buddy sizes (and consequently * memory sections). @@ -2087,10 +2087,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) * pages are guaranteed to be contiguous. */ #define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#else +#elif defined(CONFIG_HUGETLB_PAGE) /* * There is no real limit on the folio size. 
We limit them to the maximum we - * currently expect (e.g., hugetlb, dax). + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. */ #define MAX_FOLIO_ORDER PUD_ORDER #endif diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717..ca3f146bc705 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -908,6 +908,13 @@ config PAGE_MAPCOUNT config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# +# We can end up creating gigantic folio. +# +config HAVE_GIGANTIC_FOLIOS + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n -- cgit v1.2.3 From 4051a9115ad24bb9a691774730ca9c1dd56de665 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Sep 2025 22:19:10 -0400 Subject: new helper: simple_remove_by_name() simple_recursive_removal(), but instead of victim dentry it takes parent + name. Used to be open-coded in fs/fuse/control.c, but there's no need to expose the guts of that thing there and there are other potential users, so let's lift it into libfs... Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/fuse/control.c | 7 +------ fs/libfs.c | 13 +++++++++++++ include/linux/fs.h | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 5247df896c5d..3dca752127ff 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -290,18 +290,13 @@ static void remove_one(struct dentry *dentry) */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { - struct dentry *dentry; char name[32]; if (!fuse_control_sb || fc->no_control) return; sprintf(name, "%u", fc->dev); - dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root); - if (!IS_ERR(dentry)) { - simple_recursive_removal(dentry, remove_one); - dput(dentry); // paired with lookup_noperm_positive_unlocked() - } + simple_remove_by_name(fuse_control_sb->s_root, name, remove_one); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..d029aff41f66 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -655,6 +655,19 @@ void simple_recursive_removal(struct dentry *dentry, } EXPORT_SYMBOL(simple_recursive_removal); +void simple_remove_by_name(struct dentry *parent, const char *name, + void (*callback)(struct dentry *)) +{ + struct dentry *dentry; + + dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); + if (!IS_ERR(dentry)) { + simple_recursive_removal(dentry, callback); + dput(dentry); // paired with lookup_noperm_positive_unlocked() + } +} +EXPORT_SYMBOL(simple_remove_by_name); + /* caller holds parent directory with I_MUTEX_PARENT */ void locked_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..28bd4e8d3892 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3631,6 +3631,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern void simple_remove_by_name(struct dentry *, const char *, + void (*callback)(struct dentry *)); 
extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From 1552ddc7fade1ae55af298580ef6c913b8db74bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Sep 2025 17:46:01 -0400 Subject: new helper: simple_done_creating() should be paired with simple_start_creating() - unlocks parent and drops dentry reference. Signed-off-by: Al Viro --- fs/libfs.c | 8 ++++++++ include/linux/fs.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index d029aff41f66..a033f35493d0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2326,3 +2326,11 @@ struct dentry *simple_start_creating(struct dentry *parent, const char *name) return dentry; } EXPORT_SYMBOL(simple_start_creating); + +/* parent must have been held exclusive since simple_start_creating() */ +void simple_done_creating(struct dentry *child) +{ + inode_unlock(child->d_parent->d_inode); + dput(child); +} +EXPORT_SYMBOL(simple_done_creating); diff --git a/include/linux/fs.h b/include/linux/fs.h index 28bd4e8d3892..f5037c556f61 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3662,6 +3662,7 @@ extern int simple_fill_super(struct super_block *, unsigned long, extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); +void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -- cgit v1.2.3 From 8a210cacf5dc2a6210ee42aeca5cd03b2400876f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:15:35 -0500 Subject: introduce a flag for explicitly marking persistently pinned dentries Some filesystems use a kinda-sorta controlled dentry refcount leak to pin dentries of created objects in dcache (and undo it when removing those). Reference is grabbed and not released, but it's not actually _stored_ anywhere. That works, but it's hard to follow and verify; among other things, we have no way to tell _which_ of the increments is intended to be an unpaired one. Worse, on removal we need to decide whether the reference had already been dropped, which can be non-trivial if that removal is on umount and we need to figure out if this dentry is pinned due to e.g. unlink() not done. Usually that is handled by using kill_litter_super() as ->kill_sb(), but there are open-coded special cases of the same (consider e.g. /proc/self). Things get simpler if we introduce a new dentry flag (DCACHE_PERSISTENT) marking those "leaked" dentries. Having it set claims responsibility for +1 in refcount. The end result this series is aiming for: * get these unbalanced dget() and dput() replaced with new primitives that would, in addition to adjusting refcount, set and clear persistency flag. * instead of having kill_litter_super() mess with removing the remaining "leaked" references (e.g. for all tmpfs files that hadn't been removed prior to umount), have the regular shrink_dcache_for_umount() strip DCACHE_PERSISTENT of all dentries, dropping the corresponding reference if it had been set. After that kill_litter_super() becomes an equivalent of kill_anon_super(). Doing that in a single step is not feasible - it would affect too many places in too many filesystems. It has to be split into a series. 
Here we * introduce the new flag * teach shrink_dcache_for_umount() to handle it (i.e. remove and drop refcount on anything that survives to umount with that flag still set) * teach kill_litter_super() that anything with that flag does *not* need to be unpinned. Next commits will add primitives for maintaining that flag and convert the common helpers to those. After that - a long series of per-filesystem patches converting to those primitives.
Signed-off-by: Al Viro --- fs/dcache.c | 27 ++++++++++++++++++++++----- include/linux/dcache.h | 1 + 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux')
diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..f2c9f4fef2a2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1511,6 +1511,15 @@ out: return ret; } +static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_PERSISTENT) { + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + } + return select_collect(_data, dentry); +} + static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; @@ -1539,18 +1548,20 @@ out: } /** - * shrink_dcache_parent - prune dcache + * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune + * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry *parent) +static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); - d_walk(parent, &data, select_collect); + d_walk(parent, &data, + for_umount ? select_collect_umount : select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); @@ -1575,6 +1586,11 @@ void shrink_dcache_parent(struct dentry *parent) shrink_dentry_list(&data.dispose); } } + +void shrink_dcache_parent(struct dentry *parent) +{ + shrink_dcache_tree(parent, false); +} EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) @@ -1601,7 +1617,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { - shrink_dcache_parent(dentry); + shrink_dcache_tree(dentry, true); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); @@ -3111,7 +3127,8 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { - if (d_unhashed(dentry) || !dentry->d_inode) + if (d_unhashed(dentry) || !dentry->d_inode || + dentry->d_flags & DCACHE_PERSISTENT) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..94b58655322a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -225,6 +225,7 @@ enum dentry_flags { DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ + DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ -- cgit v1.2.3
From bacdf1d70bbe2027619c7bbbe48b379a806a9678 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:38:04 -0500 Subject: primitives for maintaining persistency * d_make_persistent(dentry, inode) - bump refcount, mark persistent and make hashed positive.
Return value is a borrowed reference to dentry; it can be used until something removes persistency (at the very least, until the parent gets unlocked, but some filesystems may have stronger exclusion). * d_make_discardable() - remove persistency mark and drop reference. d_make_persistent() is similar to combination of d_instantiate(), dget() and setting flag. The only difference is that unlike d_instantiate() it accepts hashed and unhashed negatives alike. It is always called in strong locking environment (parent held exclusive, or, in some cases, dentry coming from d_alloc_name()); if we ever start using it with parent held only shared and dentry coming from d_alloc_parallel(), we'll need to copy the in-lookup logics from __d_add(). d_make_discardable() is eqiuvalent to combination of removing flag and dput(); since flag removal requires ->d_lock, there's no point trying to avoid taking that for refcount decrement as fast_dput() does. The slow path of dput() has been taken into a helper and reused in d_make_discardable() instead. Signed-off-by: Al Viro --- fs/dcache.c | 74 ++++++++++++++++++++++++++++++++++++++++---------- include/linux/dcache.h | 2 ++ 2 files changed, 61 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index f2c9f4fef2a2..3cc6c3876177 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -869,6 +869,24 @@ locked: return false; } +static void finish_dput(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(RCU) +{ + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { + spin_unlock(&dentry->d_lock); + return; + } + rcu_read_lock(); + } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); +} /* * This is dput @@ -906,22 +924,28 @@ void dput(struct dentry *dentry) rcu_read_unlock(); return; } - while (lock_for_kill(dentry)) { - rcu_read_unlock(); - dentry = __dentry_kill(dentry); - if (!dentry) - return; - if (retain_dentry(dentry, true)) { - spin_unlock(&dentry->d_lock); - return; - } - rcu_read_lock(); - } - rcu_read_unlock(); - spin_unlock(&dentry->d_lock); + finish_dput(dentry); } EXPORT_SYMBOL(dput); +void d_make_discardable(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + /* + * By the end of the series we'll add + * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT); + * here, but while object removal is done by a few common helpers, + * object creation tends to be open-coded (if nothing else, new inode + * needs to be set up), so adding a warning from the very beginning + * would make for much messier patch series. + */ + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + rcu_read_lock(); + finish_dput(dentry); +} +EXPORT_SYMBOL(d_make_discardable); + static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { @@ -1939,7 +1963,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); - spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. 
@@ -1952,7 +1975,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); - spin_unlock(&dentry->d_lock); } /** @@ -1976,7 +1998,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode) if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); } } @@ -1995,7 +2019,9 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* @@ -2754,6 +2780,24 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); +struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!inode); + security_d_instantiate(dentry, inode); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + __d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_PERSISTENT; + dget_dlock(dentry); + if (d_unhashed(dentry)) + __d_rehash(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + return dentry; +} +EXPORT_SYMBOL(d_make_persistent); + static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 94b58655322a..6ec4066825e3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -611,5 +611,7 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry) } void set_default_d_op(struct super_block *, const struct dentry_operations *); +struct dentry *d_make_persistent(struct dentry *, struct inode *); +void d_make_discardable(struct dentry *dentry); #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 23cbc7a795853bc7a8d0512b7c686ef879f6e909 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Feb 2024 01:55:36 -0500 Subject: procfs: make /self and /thread_self dentries persistent ... and there's no need to remember those pointers anywhere - ->kill_sb() no longer needs to bother since kill_anon_super() will take care of them anyway and proc_pid_readdir() only wants the inumbers, which we had in a couple of static variables all along. 
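Reduced to a sketch (a hypothetical pseudo-filesystem, locking and error handling elided; 'saved_dentry' is an illustrative field, not taken from the patch), the change to such a pinned-entry setup path is:

	/* before: pin by leaking a reference and remember it for ->kill_sb() */
	d_add(dentry, inode);
	fs_info->saved_dentry = dentry;		/* dput() it on umount */

	/* after: mark the pinned reference explicitly; shrink_dcache_for_umount()
	 * strips DCACHE_PERSISTENT and drops it, so nothing needs remembering */
	d_make_persistent(dentry, inode);
	dput(dentry);				/* drop only the allocation reference */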
Signed-off-by: Al Viro --- fs/proc/base.c | 6 ++---- fs/proc/internal.h | 1 + fs/proc/root.c | 14 ++++---------- fs/proc/self.c | 10 +++------- fs/proc/thread_self.c | 11 +++-------- include/linux/proc_fs.h | 2 -- 6 files changed, 13 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97..869677a26332 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3585,14 +3585,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(fs_info->proc_self); - if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(fs_info->proc_thread_self); - if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c..c1e8eb984da8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -373,6 +373,7 @@ static inline void proc_tty_init(void) {} extern struct proc_dir_entry proc_root; extern void proc_self_init(void); +extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c diff --git a/fs/proc/root.c b/fs/proc/root.c index 1e24e085c7d5..d8ca41d823e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -347,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); - if (!fs_info) { - kill_anon_super(sb); - return; - } - - dput(fs_info->proc_self); - dput(fs_info->proc_thread_self); - kill_anon_super(sb); - put_pid_ns(fs_info->pid_ns); - kfree_rcu(fs_info, rcu); + if (fs_info) { + put_pid_ns(fs_info->pid_ns); + kfree_rcu(fs_info, rcu); + } } static struct file_system_type proc_fs_type = { diff --git a/fs/proc/self.c b/fs/proc/self.c index b46fbfd22681..62d2c0cfe35c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum __ro_after_init; +unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; @@ -51,18 +50,15 @@ int proc_setup_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; - d_add(self, inode); + d_make_persistent(self, inode); ret = 0; - } else { - dput(self); } + dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - else - fs_info->proc_self = self; return ret; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 0e5050d6ab64..d6113dbe58e0 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum __ro_after_init; +unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; @@ -51,19 +50,15 @@ int proc_setup_thread_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; 
inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; - d_add(thread_self, inode); + d_make_persistent(thread_self, inode); ret = 0; - } else { - dput(thread_self); } + dput(thread_self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); - else - fs_info->proc_thread_self = thread_self; - return ret; } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index f139377f4b31..19d1c5e5f335 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -66,8 +66,6 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; - struct dentry *proc_self; /* For /proc/self */ - struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; -- cgit v1.2.3 From 566a414558aec1ab263ab8709fa783dfa2e34325 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 2 Oct 2025 10:00:52 -0400 Subject: svcrdma: Increase the server's default RPC/RDMA credit grant The range of commits from commit e3274026e2ec ("SUNRPC: move all of xprt handling into svc_xprt_handle()") to commit 15d39883ee7d ("SUNRPC: change the back-channel queue to lwq") enabled NFSD performance to scale better as the number of nfsd threads is increased. These commits were merged in v6.7. Now that the nfsd thread count can scale to more threads, permit individual clients to make more use of those threads. Increase the RPC/RDMA per-connection credit grant from 64 to 128 -- same as the Linux NFS client. Simple single client fio-based benchmarking so far shows only improvement, no regression. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 22704c2e5b9b..57f4fd94166a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -131,7 +131,7 @@ static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) */ enum { RPCRDMA_LISTEN_BACKLOG = 10, - RPCRDMA_MAX_REQUESTS = 64, + RPCRDMA_MAX_REQUESTS = 128, RPCRDMA_MAX_BC_REQUESTS = 2, }; -- cgit v1.2.3 From 6b3b697d65d46a0f640216a3f6c72856c159c567 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Oct 2025 09:54:53 -0400 Subject: sunrpc: allocate a separate bvec array for socket sends svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of rq_bvec as the start, but doesn't reduce the array length by one, which could lead to an array overrun. Also, rq_bvec is always rq_maxpages in length, which can be too short in some cases, since the TCP record marker consumes a slot. Fix both problems by adding a separate bvec array to the svc_sock that is specifically for sending. For TCP, make this array one slot longer than rq_maxpages, to account for the record marker. For UDP, only allocate as large an array as we need since it's limited to 64k of payload. 
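Concretely, the sizing introduced below (see svc_sock_sendpages() in the hunks that follow) comes out to:

  TCP, temporary sockets:  sk_bvec gets svc_serv_maxpages(serv) + 1 entries --
                           slot 0 carries the 4-byte record marker, the rest
                           carry the RPC reply;
  TCP, listener sockets:   no sk_bvec at all, since they never send replies;
  UDP:                     sk_bvec gets 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1
                           entries (header page + payload pages + one slot for an
                           unaligned payload + tail page).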
Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 3 +++ net/sunrpc/svcsock.c | 55 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 963bbe251e52..de37069aba90 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -26,6 +26,9 @@ struct svc_sock { void (*sk_odata)(struct sock *); void (*sk_owspace)(struct sock *); + /* For sends (protected by xpt_mutex) */ + struct bio_vec *sk_bvec; + /* private TCP part */ /* On-the-wire fragment header: */ __be32 sk_marker; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0cb9c4d45745..93de79020a2d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. */ @@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, int ret; /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. 
*/ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); @@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; unsigned long pages; + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + pages = svc_serv_maxpages(serv); svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } -- cgit v1.2.3 From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Sep 2025 11:46:43 +0100 Subject: mm/thp: drop follow_devmap_pmd() default stub follow_devmap_pmd() has already been dropped by the commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). The fallback stub in the header which is now redundant, can be dropped off as well. 
Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Dev Jain Reviewed-by: Alistair Popple Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..fee4cf7fa300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; -- cgit v1.2.3 From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:29 +0200 Subject: mm/vmalloc: defer freeing partly initialized vm_struct __vmalloc_area_node() may call free_vmap_area() or vfree() on error paths, both of which can sleep. This becomes problematic if the function is invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. To fix this, unify error paths and defer the cleanup of partly initialized vm_struct objects to a workqueue. This ensures that freeing happens in a process context and avoids invalid sleeps in atomic regions. Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 6 +++++- mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..1e43181369f1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. */ + }; + void *addr; unsigned long size; unsigned long flags; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d83c01caaabe..9e29dd767c41 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3687,6 +3687,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static LLIST_HEAD(pending_vm_area_cleanup); +static void cleanup_vm_area_work(struct work_struct *work) +{ + struct vm_struct *area, *tmp; + struct llist_node *head; + + head = llist_del_all(&pending_vm_area_cleanup); + if (!head) + return; + + llist_for_each_entry_safe(area, tmp, head, llnode) { + if (!area->pages) + free_vm_area(area); + else + vfree(area->addr); + } +} + +/* + * Helper for __vmalloc_area_node() to defer cleanup + * of partially initialized vm_struct in error paths. 
+ */ +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); +static void defer_vm_area_cleanup(struct vm_struct *area) +{ + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) + schedule_work(&cleanup_vm_area); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3718,8 +3747,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3796,7 +3824,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - vfree(area->addr); + defer_vm_area_cleanup(area); return NULL; } -- cgit v1.2.3 From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:30 +0200 Subject: mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node() Make __vmalloc_area_node() respect non-blocking GFP masks such as GFP_ATOMIC and GFP_NOWAIT. - Add memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers to apply a proper scope. - Apply memalloc_apply_gfp_scope()/memalloc_restore_scope() around vmap_pages_range() for page table setup. - Set "nofail" to false if a non-blocking mask is used, as they are mutually exclusive. This is particularly important for page table allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are applied. For example: __pte_alloc_kernel() pte_alloc_one_kernel(&init_mm); pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); Note: in most cases, PTE entries are established only up to the level required by current vmap space usage, meaning the page tables are typically fully populated during the mapping process. Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 52 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1e43181369f1..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9e29dd767c41..d8bcd87239b5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3716,6 +3716,42 @@ static void defer_vm_area_cleanup(struct vm_struct *area) schedule_work(&cleanup_vm_area); } +/* + * Page tables allocations ignore external GFP. Enforces it by + * the memalloc scope API. It is used by vmalloc internals and + * KASAN shadow population only. + * + * GFP to scope mapping: + * + * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() + * GFP_NOFS - memalloc_nofs_save() + * GFP_NOIO - memalloc_noio_save() + * + * Returns a flag cookie to pair with restore. 
+ */ +unsigned int +memalloc_apply_gfp_scope(gfp_t gfp_mask) +{ + unsigned int flags = 0; + + if (!gfpflags_allow_blocking(gfp_mask)) + flags = memalloc_noreclaim_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + /* 0 - no scope applied. */ + return flags; +} + +void +memalloc_restore_scope(unsigned int flags) +{ + if (flags) + memalloc_flags_restore(flags); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3732,6 +3768,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, array_size = (unsigned long)nr_small_pages * sizeof(struct page *); + /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ + if (!gfpflags_allow_blocking(gfp_mask)) + nofail = false; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3797,22 +3837,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, -- cgit v1.2.3 From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:32 +0200 Subject: kmsan: remove hard-coded GFP_KERNEL flags kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays with GFP_KERNEL, which may sleep. This is inconsistent with vmalloc() as it will support non-blocking requests later. Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use it internally for its demand. Please note, the subsequent __vmap_pages_range_noflush() still uses GFP_KERNEL and can sleep. If a caller runs under reclaim constraints, sleeping is forbidden, it must establish the appropriate memalloc scope API. Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 6 ++++-- mm/internal.h | 4 ++-- mm/kmsan/shadow.c | 6 +++--- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 26 +++++++++++++++++--------- 5 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. 
Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..e623c8103358 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift); + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) @@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void) static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 55fdea199aaf..e7f554a31bb4 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order) int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift) + unsigned int page_shift, gfp_t gfp_mask) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; @@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, return 0; nr = (end - start) / PAGE_SIZE; - s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); - o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask); + o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask); if (!s_pages || !o_pages) { err = -ENOMEM; goto ret; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index cd69caf6aa8d..4f5937090590 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), - PAGE_KERNEL, pages, PAGE_SHIFT); + PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d8bcd87239b5..d7e7049e01f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,16 +671,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, - page_shift); + page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } +static int __vmap_pages_range(unsigned long addr, unsigned long end, + 
pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); + flush_cache_vmap(addr, end); + return err; +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -696,11 +708,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, @@ -3839,8 +3847,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, */ flags = memalloc_apply_gfp_scope(gfp_mask); do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); + ret = __vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); -- cgit v1.2.3 From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:33 +0200 Subject: mm: skip might_alloc() warnings when PF_MEMALLOC is set might_alloc() catches invalid blocking allocations in contexts where sleeping is not allowed. However when PF_MEMALLOC is set, the page allocator already skips reclaim and other blocking paths. In such cases, a blocking gfp_mask does not actually lead to blocking, so triggering might_alloc() splats is misleading. Adjust might_alloc() to skip warnings when the current task has PF_MEMALLOC set, matching the allocator's actual blocking behaviour. Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..a74582aed747 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } -- cgit v1.2.3 From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:28:21 +0800 Subject: mm/ksm: fix exec/fork inheritance support for prctl Patch series "ksm: fix exec/fork inheritance", v2. This series fixes exec/fork inheritance. See the detailed description of the issue below. This patch (of 2): Background ========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by prctl() so that the process's VMAs are forcibly scanned by ksmd. Subsequently, the 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve(). 
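For background, the per-process opt-in described above is a single prctl() call from user space. A minimal sketch (assumes CONFIG_KSM and headers recent enough to define PR_SET_MEMORY_MERGE; the fallback value below should match include/uapi/linux/prctl.h):

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	#ifndef PR_SET_MEMORY_MERGE
	#define PR_SET_MEMORY_MERGE 67
	#endif

	int main(void)
	{
		/* Sets MMF_VM_MERGE_ANY: all current and future anonymous
		 * VMAs of this process become candidates for ksmd. */
		if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
			perror("PR_SET_MEMORY_MERGE");

		/* After execve() the new image is expected to keep the flag;
		 * the problem described below is that this inheritance can
		 * get lost. */
		execlp("true", "true", (char *)NULL);
		return 1;
	}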
Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl") fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY by adding the mm_slot to ksm_mm_head in __bprm_mm_init(). Problem ======= In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY during exec/fork can fail. For example, when the scanning frequency of ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may still fail to pass it to the newly exec'd process. This happens because ksm_execve() is executed too early in the do_execve flow (prematurely adding the new mm_struct to the ksm_mm_slot list). As a result, before do_execve completes, ksmd may have already performed a scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus clearing its MMF_VM_MERGE_ANY flag. Consequently, when the new program executes, the MMF_VM_MERGE_ANY flag inheritance is lost. Root reason =========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clears the MMF_VM_MERGE_ANY flag when ksmd finds no VM_MERGEABLE VMAs. Solution ======== Firstly, don't clear MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE VMAs, because their mm_struct may have just been added to the ksm_mm_slot list, and its process may not yet have officially started running or may not yet have performed mmap/brk to allocate anonymous VMAs. Secondly, recheck MMF_VM_MERGEABLE if a process has MMF_VM_MERGE_ANY set, and if it was cleared, create a mm_slot and add it to the ksm scan list again. Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Signed-off-by: xu xin Cc: Stefan Roesch Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Wang Yaxin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/mm/ksm.c b/mm/ksm.c index cdefba633856..4f672f4f2140 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2712,8 +2712,14 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); + /* + * Only clear MMF_VM_MERGEABLE. We must not clear + * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, + * perhaps their mm_struct has just been added to ksm_mm_slot + * list, and its process has not yet officially started running + * or has not yet performed mmap/brk to allocate anonymous VMAS.
+ */ mm_flags_clear(MMF_VM_MERGEABLE, mm); - mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2831,12 +2837,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * Returns: @vm_flags possibly updated to mark mergeable. */ -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) + __ksm_should_add_vma(file, vm_flags)) { vm_flags |= VM_MERGEABLE; + /* + * Generally, the flags here always include MMF_VM_MERGEABLE. + * However, in rare cases, this flag may be cleared by ksmd who + * scans a cycle without finding any mergeable vma. + */ + if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) + __ksm_enter(mm); + } return vm_flags; } -- cgit v1.2.3 From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:53:04 +0100 Subject: mm: consistently use current->mm in mm_get_unmapped_area() mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() / arch_get_unmapped_area_topdown(), both of which search current->mm for some free space. Neither take an mm_struct - they implicitly operate on current->mm. But the wrapper takes an mm_struct and uses it to decide whether to search bottom up or top down. All callers pass in current->mm for this, so everything is working consistently. But it feels like an accident waiting to happen; eventually someone will call that function with a different mm, expecting to find free space in it, but what gets returned is free space in the current mm. So let's simplify by removing the parameter and have the wrapper use current->mm to decide which end to start at. Now everything is consistent and self-documenting. Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_64.c | 6 +++--- arch/x86/kernel/cpu/sgx/driver.c | 2 +- drivers/char/mem.c | 2 +- drivers/dax/device.c | 5 ++--- fs/hugetlbfs/inode.c | 3 +-- fs/proc/inode.c | 2 +- fs/ramfs/file-mmu.c | 2 +- include/linux/sched/mm.h | 9 ++++----- io_uring/memmap.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/huge_memory.c | 4 ++-- mm/mmap.c | 17 +++++++---------- mm/shmem.c | 8 +++----- 14 files changed, 29 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55faf2effa46..dbf118b40601 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u if (flags & MAP_FIXED) { /* Ok, don't mess with it. 
*/ - return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + addr = mm_get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); @@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. */ if (addr & ~PAGE_MASK) - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); return addr; } diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..3b3efadb8cae 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 34b815901b20..db1ca53a6d01 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -542,7 +542,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, #ifdef CONFIG_TRANSPARENT_HUGEPAGE return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); #endif } #endif /* CONFIG_MMU */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2bb40a6060af..7f1ed0db8337 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -330,14 +330,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f42548ee9083..ce8e40d35032 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -184,8 +184,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9b7ef122343..2d3425cfa94b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; diff --git 
a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b11f5b20b78b..c3ed1c5117b2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a74582aed747..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/memmap.c b/io_uring/memmap.c index add03ca75cb9..63fcfa757bb8 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -387,7 +387,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1074ac4459f2..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..d77685f2c6cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2a521e5d68..32479ae27400 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1127,7 +1127,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1164,7 +1164,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } diff --git a/mm/mmap.c 
b/mm/mmap.c index 5fd3b80fda1d..644f02071a41 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { - if (mm_flags_test(MMF_TOPDOWN, mm)) + if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) @@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } unsigned long -mm_get_unmapped_area(struct mm_struct *mm, struct file *file, - unsigned long addr, unsigned long len, +mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area_vmflags(mm, file, addr, len, - pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); diff --git a/mm/shmem.c b/mm/shmem.c index 58701d14dd96..0eecb486a0cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2759,8 +2759,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, - flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2838,8 +2837,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, - inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -5775,7 +5773,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -- cgit v1.2.3 From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:03 +0500 Subject: kasan: cleanup of kasan_enabled() checks Deduplication of kasan_enabled() checks which are already used by callers. * Altered functions: check_page_allocation Delete the check because callers have it already in __wrappers in include/linux/kasan.h: __kasan_kfree_large __kasan_mempool_poison_pages __kasan_mempool_poison_object kasan_populate_vmalloc, kasan_release_vmalloc Add __wrappers in include/linux/kasan.h. They are called externally in mm/vmalloc.c. 
__kasan_unpoison_vmalloc, __kasan_poison_vmalloc Delete checks because there're already kasan_enabled() checks in respective __wrappers in include/linux/kasan.h. release_free_meta -- Delete the check because the higher caller path has it already. See the stack trace: __kasan_slab_free -- has the check already __kasan_mempool_poison_object -- has the check already poison_slab_object kasan_save_free_info release_free_meta kasan_enabled() -- Delete here Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 20 ++++++++++++++++++-- mm/kasan/common.c | 3 --- mm/kasan/generic.c | 3 --- mm/kasan/shadow.c | 20 ++++---------------- 4 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..22e5d67ff064 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_enabled()) - return false; - if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 516b49accc4f..2b8e73f5f6a7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_enabled()) - return; - /* Check if free meta is valid. 
*/ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index a30d84bfdd52..29a751a8a08d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) +static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; @@ -395,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_enabled()) - return 0; - if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -424,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); + ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -566,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * pages entirely covered by the free region, we will not run in to any * trouble - any simultaneous allocations will be for disjoint regions. */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) @@ -575,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_enabled()) - return; - region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -626,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_enabled()) - return (void *)start; - if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -651,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_enabled()) - return; - if (!is_vmalloc_or_module_addr(start)) return; -- cgit v1.2.3 From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:16 +0530 Subject: drivers/base/node: fold register_node() into register_one_node() Patch series "drivers/base/node: fold node register and unregister functions", v2. The first patch merges register_one_node() and register_node(), leaving a single register_node() function. The second patch merges unregister_one_node() and unregister_node(), leaving a single unregister_node() function. There are no functional changes in these patches. This patch (of 2): register_node() is only called from register_one_node(). This patch folds register_node() into its only caller and renames register_one_node() to register_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. 
[akpm@linux-foundation.org: fix kerneldoc, per David] Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- arch/x86/mm/numa.c | 4 +-- drivers/base/node.c | 52 ++++++++++++------------------ include/linux/node.h | 4 +-- mm/memory_hotplug.c | 4 +-- mm/mm_init.c | 2 +- 6 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index aeb8633a3d00..8c77ec7980de 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) nid = of_node_to_nid(dn); if (likely((nid) >= 0)) { if (!node_online(nid)) { - if (register_one_node(nid)) { + if (register_node(nid)) { pr_err("PCI: Failed to register node %d\n", nid); } else { update_numa_distance(dn); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c24890c40138..7a97327140df 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -262,7 +262,7 @@ void __init init_gi_nodes(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ @@ -303,7 +303,7 @@ void __init init_cpu_to_node(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ diff --git a/drivers/base/node.c b/drivers/base/node.c index 83aeb0518e1d..17d7b90403ff 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,33 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/* - * register_node - Setup a sysfs device for a node. - * @num - Node number to use when creating the device. - * - * Initialize and register the node device. - */ -static int register_node(struct node *node, int num) -{ - int error; - - node->dev.id = num; - node->dev.bus = &node_subsys; - node->dev.release = node_device_release; - node->dev.groups = node_dev_groups; - error = device_register(&node->dev); - - if (error) { - put_device(&node->dev); - } else { - hugetlb_register_node(node); - compaction_register_node(node); - reclaim_register_node(node); - } - - return error; -} - /** * unregister_node - unregister a node device * @node: node going away @@ -907,7 +880,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn, } #endif /* CONFIG_MEMORY_HOTPLUG */ -int register_one_node(int nid) +/** + * register_node - Initialize and register the node device. + * @nid: Node number to use when creating the device. 
+ * + * Return: 0 on success, -errno otherwise + */ +int register_node(int nid) { int error; int cpu; @@ -918,14 +897,23 @@ int register_one_node(int nid) return -ENOMEM; INIT_LIST_HEAD(&node->access_list); - node_devices[nid] = node; - error = register_node(node_devices[nid], nid); + node->dev.id = nid; + node->dev.bus = &node_subsys; + node->dev.release = node_device_release; + node->dev.groups = node_dev_groups; + + error = device_register(&node->dev); if (error) { - node_devices[nid] = NULL; + put_device(&node->dev); return error; } + node_devices[nid] = node; + hugetlb_register_node(node); + compaction_register_node(node); + reclaim_register_node(node); + /* link cpu under this node */ for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) @@ -1018,7 +1006,7 @@ void __init node_dev_init(void) * to already created cpu devices. */ for_each_online_node(i) { - ret = register_one_node(i); + ret = register_node(i); if (ret) panic("%s() failed to add node: %d\n", __func__, ret); } diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..b7028d3ec3b4 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); +int register_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..6c050d867031 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1311,7 +1311,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1542,7 +1542,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error_memblock_remove; if (ret) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); if (WARN_ON(ret)) { node_set_offline(nid); goto error_memblock_remove; diff --git a/mm/mm_init.c b/mm/mm_init.c index 7712d887b696..c6812b4dbb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarchy will be created via register_one_node() + * No sysfs hierarchy will be created via register_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of -- cgit v1.2.3 From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:17 +0530 Subject: drivers/base/node: fold unregister_node() into unregister_one_node() unregister_node() is only called from unregister_one_node(). This patch folds unregister_node() into its only caller and renames unregister_one_node() to unregister_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. 
[donettom@linux.ibm.com: remove extra spaces before @nid and "All"] Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- drivers/base/node.c | 38 +++++++++++++++++--------------------- include/linux/node.h | 6 ++---- mm/memory_hotplug.c | 4 ++-- 3 files changed, 21 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 17d7b90403ff..00cf4532f121 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,23 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/** - * unregister_node - unregister a node device - * @node: node going away - * - * Unregisters a node device @node. All the devices on the node must be - * unregistered before calling this function. - */ -void unregister_node(struct node *node) -{ - hugetlb_unregister_node(node); - compaction_unregister_node(node); - reclaim_unregister_node(node); - node_remove_accesses(node); - node_remove_caches(node); - device_unregister(&node->dev); -} - struct node *node_devices[MAX_NUMNODES]; /* @@ -924,13 +907,26 @@ int register_node(int nid) return error; } - -void unregister_one_node(int nid) +/** + * unregister_node - unregister a node device + * @nid: nid of the node going away + * + * Unregisters the node device at node id @nid. All the devices on the + * node must be unregistered before calling this function. 
+ */ +void unregister_node(int nid) { - if (!node_devices[nid]) + struct node *node = node_devices[nid]; + + if (!node) return; - unregister_node(node_devices[nid]); + hugetlb_unregister_node(node); + compaction_unregister_node(node); + reclaim_unregister_node(node); + node_remove_accesses(node); + node_remove_caches(node); + device_unregister(&node->dev); node_devices[nid] = NULL; } diff --git a/include/linux/node.h b/include/linux/node.h index b7028d3ec3b4..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ int register_node(int nid); -extern void unregister_one_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -193,7 +191,7 @@ static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c050d867031..94a8f6e8811a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1596,7 +1596,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) error: if (new_node) { node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) @@ -2201,7 +2201,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); -- cgit v1.2.3 From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:08 -0700 Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. Motivation & Approach ===================== While testing workloads with high sustained memory pressure on large machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly high number of softlockups. Further investigation showed that the zone lock in free_pcppages_bulk was being held for a long time, and was called to free 2k+ pages over 100 times just during boot. This causes starvation in other processes for the zone lock, which can lead to the system stalling as multiple threads cannot make progress without the locks. We can see these issues manifesting as warnings: [ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU [ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 [ 4512.626401] rcu: hardirqs softirqs csw/system [ 4512.638793] rcu: number: 0 145 0 [ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) [ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) While these warnings don't indicate a crash or a kernel panic, they do point to the underlying issue of lock contention. To prevent starvation in both locks, batch the freeing of pages using pcp->batch. 
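To make the intended batching pattern concrete, here is a rough caller-side sketch (illustrative only, not the actual patch; the helper name drain_pages_batched is made up, while free_pcppages_bulk, pcp->lock and pcp->batch are the existing objects discussed in this series):

    /*
     * Free at most pcp->batch pages per trip so that the pcp lock (and the
     * zone lock taken inside free_pcppages_bulk()) is dropped and reacquired
     * between chunks instead of being held for the whole drain.
     */
    static void drain_pages_batched(struct zone *zone, struct per_cpu_pages *pcp,
                                    int to_drain)
    {
        int batch = READ_ONCE(pcp->batch);

        while (to_drain > 0) {
            int count = min(to_drain, batch);

            spin_lock(&pcp->lock);
            free_pcppages_bulk(zone, count, pcp, 0);
            spin_unlock(&pcp->lock);
            to_drain -= count;
        }
    }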
Because free_pcppages_bulk is called with the pcp lock and acquires the zone lock, relinquishing and reacquiring the locks are only effective when both of them are broken together (unless the system was built with queued spinlocks). Thus, instead of modifying free_pcppages_bulk to break both locks, batch the freeing from its callers instead. A similar fix has been implemented in the Meta fleet, and we have seen significantly less softlockups. Testing ======= The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). Negative delta is better (faster compilation). Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. Based on these results, it seems like there are varying degrees to how much lock contention this reduces. For the largest and smallest machines that I ran the tests on, it seems like there is quite some significant reduction. There is also some performance increases visible from userspace. Interestingly, the performance gains don't scale with the size of the machine, but rather there seems to be a dip in the gain there is for the medium-sized machine. One possible theory is that because the high watermark depends on both memory and the number of local CPUs, what impacts zone contention the most is not these individual values, but rather the ratio of mem:processors. This patch (of 5): Currently, refresh_cpu_vm_stats returns an int, indicating how many changes were made during its updates. Using this information, callers like vmstat_update can heuristically determine if more work will be done in the future. However, all of refresh_cpu_vm_stats's callers either (a) ignore the result, only caring about performing the updates, or (b) only care about whether changes were made, but not *how many* changes were made. Simplify the code by returning a bool instead to indicate if updates were made. In addition, simplify fold_diff and decay_pcp_high to return a bool for the same reason. Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: Vlastimil Babka Reviewed-by: SeongJae Park Cc: Brendan Jackman Cc: Chris Mason Cc: Johannes Weiner Cc: "Kirill A. 
Shutemov" Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmstat.c | 28 +++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10a908793b4c..f057ce5ea7da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2557,10 +2557,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, batch; - int todo = 0; + bool todo = false; high_min = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); @@ -2573,7 +2573,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), pcp->high - (pcp->high >> 3), high_min); if (pcp->high > high_min) - todo++; + todo = true; } to_drain = pcp->count - pcp->high; @@ -2581,7 +2581,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); - todo++; + todo = true; } return todo; diff --git a/mm/vmstat.c b/mm/vmstat.c index bb09c032eecf..98855f31294d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; } for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; } - return changes; + return changed; } /* @@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. 
*/ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; @@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (do_pagesets) { cond_resched(); - changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } if (__this_cpu_dec_return(pcp->expire)) { - changes++; + changed = true; continue; } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); - changes++; + changed = true; } #endif } @@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } - changes += fold_diff(global_zone_diff, global_node_diff); - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* -- cgit v1.2.3 From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 16 Oct 2025 09:10:35 -0700 Subject: memcg: net: track network throttling due to memcg memory pressure The kernel can throttle network sockets if the memory cgroup associated with the corresponding socket is under memory pressure. The throttling actions include clamping the transmit window, failing to expand receive or send buffers, aggressively prune out-of-order receive queue, FIN deferred to a retransmitted packet and more. Let's add memcg metric to track such throttling actions. At the moment memcg memory pressure is defined through vmpressure and in future it may be defined using PSI or we may add more flexible way for the users to define memory pressure, maybe through ebpf. However the potential throttling actions will remain the same, so this newly introduced metric will continue to track throttling actions irrespective of how memcg memory pressure is defined. Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Daniel Sedlak Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Neal Cardwell Cc: Paolo Abeni Cc: Simon Horman Cc: Tejun Heo Cc: Willem de Bruijn Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ include/linux/memcontrol.h | 1 + include/net/sock.h | 6 +++++- kernel/cgroup/cgroup.c | 1 + mm/memcontrol.c | 3 +++ 5 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0e6c67ac585a..3345961c30ac 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1515,6 +1515,10 @@ The following nested keys are defined. oom_group_kill The number of times a group OOM has occurred. + sock_throttled + The number of times network sockets associated with + this cgroup are throttled. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. 
The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..5fe254813123 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..ff7d49af1619 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) #endif /* CONFIG_MEMCG_V1 */ do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); return true; + } } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..8df671c59987 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4704,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ae5cbcaed75..976412c8196e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +EXPORT_SYMBOL(root_mem_cgroup); /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -4463,6 +4464,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "sock_throttled %lu\n", + atomic_long_read(&events[MEMCG_SOCK_THROTTLED])); } static int memory_events_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:07 +0800 Subject: mm: vmscan: simplify the logic for activating dirty file folios After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios through reclaim. However, in the shrink_folio_list() function, there still remains some logic related to writeback control of dirty file folios. The original logic was that, for direct reclaim, or when folio_test_reclaim() is false, or the PGDAT_DIRTY flag is not set, the dirty file folios would be directly activated to avoid being scanned again; otherwise, it will try to writeback the dirty file folios. However, since we can no longer perform writeback on dirty folios, the dirty file folios will still be activated. Additionally, under the original logic, if we continue to try writeback dirty file folios, we will also check the references flag, sc->may_writepage, and may_enter_fs(), which may result in dirty file folios being left in the inactive list. This is unreasonable. Even if these dirty folios are scanned again, we still cannot clean them. Therefore, the checks on these dirty file folios appear to be redundant and can be removed. Dirty file folios should be directly moved to the active list to avoid being scanned again. 
Since we set the PG_reclaim flag for the dirty folios, once the writeback is completed, they will be moved back to the tail of the inactive list to be retried for quick reclaim. Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ---- mm/vmscan.c | 25 +++---------------------- 2 files changed, 3 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e53ac12cc802..ecc90517b791 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1409,21 +1409,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() @@ -1432,7 +1418,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -6127,11 +6114,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty && - sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6872,7 +6854,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit v1.2.3 From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:53 -0700 Subject: mm/damon: document damos_quota_goal->nid use case Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage". Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup per-NUMA node memory utilization. Expected use cases are cgroup level access-aware NUMA memory managements, such as memory tiering or proactive reclamation on cgroup-based multi-tenant NUMA systems. 
Background ========== The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly recommended way for modern DAMOS use cases. Using it, users can specify what system status they want to achieve with what access-aware system operations. For example, reclaim cold memory aiming for 0.5 percent of memory pressure (proactive reclaim), or migrate hot and cold memory between NUMA nodes having different speeds (memory tiering). Then DAMOS automatically adjusts the aggressiveness of the system operation (e.g., increase/decrease the reclaim target coldness threshold) based on the current status of the system. The use case is limited by the supported system status metrics for specifying the target system status. Two new system metrics for per-node memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were recently added to extend the use cases for access-aware NUMA node management, such as memory tiering. Those are expected to be useful for not only memory tiering but also general access-aware inter-NUMA node page migration. Limitation ---------- The per-node memory usage based auto-tuning can be applied only system-wide. For cgroups-based multi-tenant systems, it could arguably harm fairness. For example, a cgroup may use faster NUMA node memory more than another cgroup, depending on their access patterns. If the users of each cgroup are promised the same quality and amount of system resources, this can arguably be an unfair situation. DAMOS supports cgroup level system operations via DAMOS filters. But the quota auto-tuning system is not aware of cgroups. New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage =================================================================== To overcome the limitation, introduce two new DAMOS quota auto-tuning goal metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Those can be thought of as a variant of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that is extended for cgroups. The two metrics specify the per-cgroup, per-node amount of used and unused memory as a ratio to the total memory of the node. For example, let's assume a system has two NUMA nodes of size 100 GiB and 50 GiB. And two cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node 1, respectively, as illustrated by the below table. node-0 node-1 Total memory 100 GiB 50 GiB Cgroup A usage 40 GiB 20 GiB Cgroup B usage 60 GiB 10 GiB Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node are 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000 bp (60 percent), respectively. Those for the second node are 20 GiB / 50 GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent), respectively. DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are 60 GiB / 100 GiB = 6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB / 50 GiB = 8000 bp, respectively. DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp Using these, users can specify how much used (or unused) memory, per cgroup and per node, DAMOS should aim for as a result of the auto-tuning.
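For a quick cross-check of the numbers above, the metric values are plain basis-point ratios; a rough sketch of the arithmetic follows (illustrative only, the helper names are made up and this is not the in-kernel implementation):

    /* 40 GiB used by the cgroup on a 100 GiB node -> 4000 bp (40 percent). */
    static unsigned long node_memcg_used_bp(unsigned long cgroup_node_used,
                                            unsigned long node_total)
    {
        return cgroup_node_used * 10000 / node_total;
    }

    /*
     * The "free" variant counts the rest of the node not used by the cgroup:
     * 100 GiB - 40 GiB = 60 GiB -> 6000 bp for cgroup A on node 0.
     */
    static unsigned long node_memcg_free_bp(unsigned long cgroup_node_used,
                                            unsigned long node_total)
    {
        return (node_total - cgroup_node_used) * 10000 / node_total;
    }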
Example Usecase: Cgroup Level Memory Tiering ============================================ Let's suppose a typical and simple tiered memory system. The system equips two NUMA nodes. The first node (node 0) is CPU-attached and fast. The second node (node 1) is CPU-unattached and slow. It runs two cgroups that desire to use about 30 percent and 70 percent of the faster node as much as possible for their hot data, respectively. Then, the user can implement DAMOS-based memory tiering for the system using the DAMON user-space tool (damo), like below. # ./damo start \ `# kdamond for node 1 (slow)` \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ `# promotion scheme for cgroup a` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \ \ `# promotion scheme for cgroup b` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 69.7% 0 workloads/b \ \ `# kdamond for node 0 (fast)` \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ `# demotion scheme for cgroup a` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 70.5% 0 \ \ `# demotion scheme for cgroup b` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 30.5% 0 \ \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 1 1 1 1 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 With the command, the user-space tool will ask DAMON to spawn two kernel threads, each for monitoring accesses to node 1 (slow) and node 0 (fast), respectively. It installs two DAMOS schemes on each thread. Let's call them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup a/b" in the order. The promotion schemes are installed on the DAMON thread for node 1 (slow), and demotion schemes are installed on the DAMON thread for node 0 (fast). Cgroup Level Hot Pages Migration (Promotion) -------------------------------------------- Promotion schemes will find memory regions on node 1 (slow), that some access was detected. The schemes will then migrate the found memory to node 0 (fast), hottest pages first. For accurate and effective migration, these schemes use two page level filters. First, the migration will be filtered for only cgroup A and cgroup B. That is, "promotion scheme for cgroup B" will not do the migration if the page is for cgroup A. Secondly, the schemes will ignore pages that having their page table's Accessed bits unset. The per-page Accessed bit check logic will also unset the bit if it was set, for the next check. For controlled amounts of system resource consumption and aiming on the target memory usage, the schemes use quotas setup. The migration is limited to be done only up to 200 MiB per second, to limit the peak system resource usage. 
And the DAMOS_QUOTA_NODE_MEMCG_USED_BP targets are set to 29.7% and 69.7% of
node 0 (fast), respectively. The target values are set slightly lower than
the high level goals (30% and 70%), to give headroom on node 0 (fast). DAMOS
will adjust the speed of the page migration based on the target and the
current per-cgroup node 0 memory usage. For example, if cgroup A is utilizing
only 10% of node 0, DAMOS will try to migrate more of cgroup A's hot pages
from node 1 to node 0, up to 200 MiB per second. If cgroup A utilizes more
than 29.7% of node 0 memory, the cgroup A hot pages migration from node 1 to
node 0 will be slowed and eventually stopped.

Cgroup Level Cold Pages Migration (Demotion)
--------------------------------------------

Demotion schemes are similar to the promotion schemes, but differ in their
filtering and quota tuning setups. Those filter out pages that have their
page table Accessed bit set. And they set node 0 memory free rate targets of
70.5% and 30.5% for cgroup A and B, respectively. Hence, if the promotion
schemes or anything else makes cgroup A and/or B use more than 29.5% and
69.5% of node 0, the demotion schemes will start migrating cold pages of the
appropriate cgroups from node 0 to node 1, under the 200 MiB per second speed
cap, while adjusting the speed based on how much more memory than wanted is
being used.

The quota target values are set to overlap with the promotion targets, to
keep a minimum level of page exchange between the nodes. This is to avoid a
case where the target memory utilization is met, and then the access pattern
changes (pages in node 1 become hotter than pages in node 0) while the memory
utilization is unchanged. Without the overlap, neither promotion of the
hotter pages in node 1 nor demotion of the colder pages in node 0 would
happen, since both goals are already met. As a result, the faster and slower
nodes would unexpectedly serve cold and hot data, respectively.

Test: Per-cgroup Memory Tiering
===============================

I ran a simplified cgroup level memory tiering using the feature, and
confirmed it works as intended.

Setup
-----

I configured a QEMU virtual machine representing a simplified version of the
system described in the above cgroup level memory tiering example use case.
The system equips 40 CPU cores and two NUMA nodes, each having 30 GiB of
physical memory. The first node (node 0) represents the faster NUMA node, and
the second node (node 1) represents the slower NUMA node. Specifically, the
below QEMU command line options are used.

    [...]
    -object memory-backend-ram,size=30G,id=m0 \
    -object memory-backend-ram,size=30G,id=m1 \
    -numa node,cpus=0-39,memdev=m0 \
    -numa node,memdev=m1 \
    [...]

I booted the virtual machine with a kernel that has this patch series
applied. On the virtual machine, I created two cgroups, namely workload_a and
workload_b, and ran a test program in each cgroup, resulting in one process
per cgroup.

The test program allocates 10 GiB of memory and evenly splits it into 10
regions. After the allocation, it repeatedly accesses the first region for
one minute, then the second one for one minute, and so on. After the one
minute of repeated access to the 10th region is done, it repeats the access
from the first region. So the process has 10 GiB of data in total, but only
1 GiB of it is hot at a given moment, and the hot data gradually changes.

While the processes are running, I ran DAMON for simple access-aware memory
tiering using the below script.
It migrates hot and cold data of the cgroups into node 0 and node 1,
respectively, aiming for the first and the second cgroups (workload_a and
workload_b, respectively) to utilize about 9.7 percent and 19.7 percent of
node 0. Note that this setup is a simplified version of the above example use
case, for ease of testing. Also note that we assigned 30 GiB of physical
memory to node 0, but DAMON in this setup works on only 27 GiB of the memory.
It is due to an internal implementation detail of the DAMON user-space tool
that is not really important for this test.

    #!/bin/bash
    damo start \
        --numa_node 1 \
        --damos_action migrate_hot 0 --damos_access_rate 5% max \
        --damos_apply_interval 1s \
        --damos_filter allow memcg /workload_a \
        --damos_filter allow young \
        --damos_quota_interval 1s \
        --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \
        --damos_action migrate_hot 0 --damos_access_rate 5% max \
        --damos_apply_interval 1s \
        --damos_filter allow memcg /workload_b \
        --damos_filter allow young \
        --damos_quota_interval 1s \
        --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \
        --numa_node 0 \
        --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
        --damos_apply_interval 1s \
        --damos_filter allow memcg /workload_a \
        --damos_filter reject young \
        --damos_quota_interval 1s \
        --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \
        --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
        --damos_apply_interval 1s \
        --damos_filter allow memcg /workload_b \
        --damos_filter reject young \
        --damos_quota_interval 1s \
        --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \
        --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \
        --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1

After starting DAMON, pages are continuously migrated across the nodes. A few
minutes later, the memory usage of the cgroups converges to the aimed amounts
and stays at that level, as expected.

To confirm the status is kept at the target level as expected, I collected
the memory usage stats of the cgroups using the memory.numa_stat file, after
the stats converged. I repeated the stat collection 42 times, with a 5 second
delay between each collection. The results are as below:

    node0_memory_usage    average    stdev
    workload_a            2.79GiB    522.06MiB
    workload_b            5.15GiB    739.10MiB

The average values are quite close to the targeted values:
27 GiB * 9.7% = 2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB for
workload_b. A level of variance is expected, given the overlap of the
promotion/demotion targets and the dynamic data access pattern of the
workloads. Given that, the measured variances are at a reasonable level.

Patches Sequence
================

The first patch (patch 1) updates the kernel-doc comment of the
damos_quota_goal struct to clarify the usage of optional fields of the
struct, since later patches will add such optional fields.

The following four patches (patches 2-5) implement a new DAMOS quota goal
metric for per-cgroup per-node memory usage. Those extend the core layer
interface for the new metric (patch 2), implement the metric value
calculation in the core layer (patch 3), add a DAMON sysfs interface file for
the target cgroup specification (patch 4), and implement support for the new
metric in the DAMON sysfs interface (patch 5).

The next two patches implement the second new DAMOS quota goal metric, for
per-cgroup per-node free (or, unused) memory. Those implement it in the core
layer (patch 6) and the DAMON sysfs interface (patch 7), extending the
existing implementation for the memory usage metric.
Final three patches update the design (patch 8), the usage (patch 9), and the ABI (patch 10) documents for the changes that are introduced by this patch series. This patch (of 10): damos_quota_goal kerneldoc comment is not explaining when @metric is used. Update the comment for that. Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..dc9c310e0e75 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -176,6 +176,9 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; -- cgit v1.2.3 From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:54 -0700 Subject: mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node memory usage. For specifying the cgroup of the interest, add a field, namely memcg_id, to damos_quota_goal struct. Note that this commit is only implementing the interface. The handling of the interface (the metric value calculation) will be implemented in the following commit. Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index dc9c310e0e75..0d63ceb7e6ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -147,6 +147,7 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +157,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +168,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +182,9 @@ enum damos_quota_goal_metric { * * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. 
+ * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents + * the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -187,7 +193,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; -- cgit v1.2.3 From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:58 -0700 Subject: mm/damon/core: add DAMOS quota gaol metric for per-memcg per-numa free memory Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory portion. The value of the metric is implemented as the entire memory of the given NUMA node subtracted by the given cgroup's usage. So from a perspective, "unused" could be a better term than "free". But arguably it is not very clear what is better, so use the term "free". Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0d63ceb7e6ef..0edf41d36ea1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -148,6 +148,7 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -158,6 +159,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -183,8 +185,8 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. * - * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents - * the node id and the cgroup to account the used memory for. + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. 
*/ struct damos_quota_goal { enum damos_quota_goal_metric metric; diff --git a/mm/damon/core.c b/mm/damon/core.c index 8aa8d269df90..a9c11d2d37b0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -790,6 +790,7 @@ static void damos_commit_quota_goal_union( dst->nid = src->nid; break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: dst->nid = src->nid; dst->memcg_id = src->memcg_id; break; @@ -2046,7 +2047,7 @@ static unsigned long damos_get_node_memcg_used_bp( { struct mem_cgroup *memcg; struct lruvec *lruvec; - unsigned long used_pages; + unsigned long used_pages, numerator; struct sysinfo i; rcu_read_lock(); @@ -2066,7 +2067,11 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); si_meminfo_node(&i, goal->nid); - return used_pages * 10000 / i.totalram; + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + numerator = used_pages; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + numerator = i.totalram - used_pages; + return numerator * 10000 / i.totalram; } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2101,6 +2106,7 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_node_mem_bp(goal); break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; default: -- cgit v1.2.3 From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:24 +0800 Subject: mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() Patch series "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM", v2. In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. This patch (of 2): In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). 
Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 6 ++++-- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- mm/damon/stat.c | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0edf41d36ea1..9ee026c2db53 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index a9c11d2d37b0..82546d138a5a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2818,6 +2818,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @min_sz_region: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the * values of @start and @end are zero, however, this function finds the biggest @@ -2828,7 +2829,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * Return: 0 on success, negative error code otherwise. 
*/ int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end) + unsigned long *start, unsigned long *end, + unsigned long min_sz_region) { struct damon_addr_range addr_range; @@ -2841,7 +2843,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); + return damon_set_regions(t, &addr_range, 1, min_sz_region); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 42b9a656f9de..49b4bc294f4e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7ba3d0f9a19a..e30811cafe90 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + DAMON_MIN_REGION); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index bf8626859902..ed8e3629d31a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + if (damon_set_region_biggest_system_ram_default(target, &start, &end, + ctx->min_sz_region)) goto free_out; return ctx; free_out: -- cgit v1.2.3 From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:21 +0100 Subject: mm: add vma_desc_size(), vma_desc_pages() helpers It's useful to be able to determine the size of a VMA descriptor range used on f_op->mmap_prepare, expressed both in bytes and pages, so add helpers for both and update code that could make use of it to do so. Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 2 +- include/linux/mm.h | 10 ++++++++++ mm/secretmem.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4c90ec2fa2ea..2f344e1ed756 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -332,7 +332,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + desc->end - desc->start); + from + vma_desc_size(desc)); if (is_sparsed(ni)) { /* Allocate clusters for rw map. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..5752b0c516f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/mm/secretmem.c b/mm/secretmem.c index 9b0f5d9ec6f4..37f6d1097853 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file) static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - const unsigned long len = desc->end - desc->start; + const unsigned long len = vma_desc_size(desc); if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; -- cgit v1.2.3 From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:24 +0100 Subject: mm: add remap_pfn_range_prepare(), remap_pfn_range_complete() We need the ability to split PFN remap between updating the VMA and performing the actual remap, in order to do away with the legacy f_op->mmap hook. To do so, update the PFN remap code to provide shared logic, and also make remap_pfn_range_notrack() static, as its one user, io_mapping_map_user() was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c"). Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor and PFN parameters, and remap_pfn_range_complete() which accepts the same parameters as remap_pfn_rangte(). remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so it must be supplied with a correct PFN to do so. While we're here, also clean up the duplicated #ifdef __HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block. We keep these internal to mm as they should only be used by internal helpers. 
Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++-- mm/internal.h | 4 ++ mm/memory.c | 132 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 110 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5752b0c516f2..ca5565f4fac4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. 
+ */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/mm/internal.h b/mm/internal.h index 56a9a714709a..5ca1e7842b19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1677,4 +1677,8 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index f13b20b702f6..8e02b8d75535 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2900,6 +2900,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2912,31 +2931,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. 
- */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2957,7 +2952,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -3002,23 +2997,9 @@ void pfnmap_track_ctx_release(struct kref *ref) pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } -#endif /* __HAVE_PFNMAP_TRACKING */ -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. - */ -#ifdef __HAVE_PFNMAP_TRACKING -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; @@ -3054,15 +3035,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, return err; } +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} #else -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. 
+ */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); + if (err) + return err; + + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to -- cgit v1.2.3 From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:25 +0100 Subject: mm: abstract io_remap_pfn_range() based on PFN The only instances in which we customise this function are ones in which we customise the PFN used. Instances where architectures were not passing the pgprot value through pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so we can simply always pass pgprot through this function. Use this fact to simplify the use of io_remap_pfn_range(), by abstracting the PFN via io_remap_pfn_range_pfn() and using this instead of providing a general io_remap_pfn_range() function per-architecture. Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgtable.h | 3 --- arch/mips/alchemy/common/setup.c | 9 +++++---- arch/mips/include/asm/pgtable.h | 5 ++--- arch/sparc/include/asm/pgtable_32.h | 12 ++++-------- arch/sparc/include/asm/pgtable_64.h | 12 ++++-------- include/linux/mm.h | 19 ++++++++++++++----- 6 files changed, 29 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 5a394be09c35..d606afbabce1 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - #endif /* __ASM_CSKY_PGTABLE_H */ diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index a7a6d31a7a41..c35b4f809d51 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size) return phys_addr; } -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size); - return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot); + return phys_addr >> PAGE_SHIFT; } -EXPORT_SYMBOL(io_remap_pfn_range); +EXPORT_SYMBOL(io_remap_pfn_range_pfn); + #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index ae73ecf4c41a..9c06a612d33a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, */ #ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size); -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot); -#define io_remap_pfn_range io_remap_pfn_range +unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size); +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #else #define fixup_bigphys_addr(addr, size) (addr) #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index f1538a48484a..a9f802d1dd64 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -395,12 +395,8 @@ __get_iospace (unsigned long addr) #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffUL) -int remap_pfn_range(struct vm_area_struct *, 
unsigned long, unsigned long, - unsigned long, pgprot_t); - -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long long offset, space, phys_base; @@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, space = GET_IOSPACE(pfn); phys_base = offset | (space << 32ULL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 64b85ff9c766..615f460c50af 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr); #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffffffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte); @@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm, return 0; } -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; int space = GET_IOSPACE(pfn); @@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, phys_base = offset | (((unsigned long) space) << 32UL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn static inline unsigned long __untagged_addr(unsigned long start) { diff --git a/include/linux/mm.h b/include/linux/mm.h index ca5565f4fac4..4441ceec913f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) -- cgit v1.2.3 From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to 
perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +- include/linux/mm.h | 74 ++++++++++++++++++++ include/linux/mm_types.h | 53 ++++++++++++++ mm/util.c | 146 ++++++++++++++++++++++++++++++++++++--- mm/vma.c | 113 ++++++++++++++++++++++-------- tools/testing/vma/vma_internal.h | 98 ++++++++++++++++++++++++-- 6 files changed, 441 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..8cf9547a881c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4441ceec913f..2d060081caa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) return vma_desc_size(desc) >> PAGE_SHIFT; } +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. 
+ * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..5021047485a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,6 +773,56 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -796,6 +846,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? 
*/ + struct mmap_action action; }; /* diff --git a/mm/util.c b/mm/util.c index 8989d5767528..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * __compat_vma_mmap() - See description for compat_vma_mmap() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. @@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio); * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. */ -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { @@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -EXPORT_SYMBOL(__compat_vma_mmap_prepare); +EXPORT_SYMBOL(__compat_vma_mmap); /** - * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA. + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * @@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * - * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * @@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * * Returns: 0 on success or error. */ -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } -EXPORT_SYMBOL(compat_vma_mmap_prepare); +EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) @@ -1280,6 +1283,127 @@ again: } } +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. 
*/ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. + */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. */ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio diff --git a/mm/vma.c b/mm/vma.c index eb2f711c03a1..919d1fc63a52 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -34,7 +34,9 @@ struct mmap_state { struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ - bool check_ksm_early; + bool check_ksm_early :1; + /* If we map new, hold the file rmap lock on mapping. 
*/ + bool hold_file_rmap_lock :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -1754,7 +1756,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -static void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1763,7 +1765,8 @@ static void vma_link_file(struct vm_area_struct *vma) mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + if (!hold_rmap_lock) + i_mmap_unlock_write(mapping); } } @@ -1777,7 +1780,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); - vma_link_file(vma); + vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; @@ -2311,17 +2314,33 @@ static void update_ksm_flags(struct mmap_state *map) map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } +static void set_desc_from_map(struct vm_area_desc *desc, + const struct mmap_state *map) +{ + desc->start = map->addr; + desc->end = map->end; + + desc->pgoff = map->pgoff; + desc->vm_file = map->file; + desc->vm_flags = map->vm_flags; + desc->page_prot = map->page_prot; +} + /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * + * As a result it sets up the @map and @desc objects. + * * @map: Mapping state. + * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ -static int __mmap_setup(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, + struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2378,6 +2397,7 @@ static int __mmap_setup(struct mmap_state *map, struct list_head *uf) */ vms_clean_up_area(vms, &map->mas_detach); + set_desc_from_map(desc, map); return 0; } @@ -2479,7 +2499,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma); + vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2539,6 +2559,17 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +static void call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + + mmap_action_prepare(action, desc); + + if (action->hide_from_rmap_until_complete) + map->hold_file_rmap_lock = true; +} + /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. @@ -2550,34 +2581,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * * Returns 0 on success, or an error code otherwise. */ -static int call_mmap_prepare(struct mmap_state *map) +static int call_mmap_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { int err; - struct vm_area_desc desc = { - .mm = map->mm, - .file = map->file, - .start = map->addr, - .end = map->end, - - .pgoff = map->pgoff, - .vm_file = map->file, - .vm_flags = map->vm_flags, - .page_prot = map->page_prot, - }; /* Invoke the hook. 
*/ - err = vfs_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, desc); if (err) return err; + call_action_prepare(map, desc); + /* Update fields permitted to be changed. */ - map->pgoff = desc.pgoff; - map->file = desc.vm_file; - map->vm_flags = desc.vm_flags; - map->page_prot = desc.page_prot; + map->pgoff = desc->pgoff; + map->file = desc->vm_file; + map->vm_flags = desc->vm_flags; + map->page_prot = desc->page_prot; /* User-defined fields. */ - map->vm_ops = desc.vm_ops; - map->vm_private_data = desc.private_data; + map->vm_ops = desc->vm_ops; + map->vm_private_data = desc->private_data; return 0; } @@ -2619,22 +2642,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } +static int call_action_complete(struct mmap_state *map, + struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + struct mmap_action *action = &desc->action; + int ret; + + ret = mmap_action_complete(action, vma); + + /* If we held the file rmap we need to release it. */ + if (map->hold_file_rmap_lock) { + struct file *file = vma->vm_file; + + i_mmap_unlock_write(file->f_mapping); + } + return ret; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + struct vm_area_desc desc = { + .mm = mm, + .file = file, + .action = { + .type = MMAP_NOTHING, /* Default to no further action. */ + }, + }; + bool allocated_new = false; + int error; map.check_ksm_early = can_set_ksm_flags_early(&map); - error = __mmap_setup(&map, uf); + error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) - error = call_mmap_prepare(&map); + error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; @@ -2653,6 +2702,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; + allocated_new = true; } if (have_mmap_prepare) @@ -2660,6 +2710,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); + if (have_mmap_prepare && allocated_new) { + error = call_action_complete(&map, &desc, vma); + + if (error) + return error; + } + return addr; /* Accounting was done by __mmap_setup(). */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..d873667704e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -275,6 +275,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. 
+ * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -298,6 +349,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? 
*/ @@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:29 +0100 Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare Since we can now perform actions after the VMA is established via mmap_prepare, use desc->action_success_hook to set up the hugetlb lock once the VMA is setup. We also make changes throughout hugetlbfs to make this possible. Note that we must hide newly established hugetlb VMAs from the rmap until the operation is entirely complete as we establish a hugetlb lock during VMA setup that can be raced by rmap users. Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Tested-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 ++++++++++++++++++------- include/linux/hugetlb.h | 9 +++-- include/linux/hugetlb_inline.h | 15 +++++--- mm/hugetlb.c | 77 ++++++++++++++++++++++++------------------ 4 files changed, 95 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ce8e40d35032..3919fca56553 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. 
*/ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); - vma->vm_ops = &hugetlb_vm_ops; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags |= VM_NORESERVE; if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags) < 0) + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. 
+ */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -1220,7 +1240,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..2387513d6ae5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7774c286b3b7..86e672fcb305 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +/* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & 
VM_MAYSHARE)) - return; + return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return; + return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { @@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; + return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; + + return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1201,20 +1206,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); - set_vma_private_data(vma, (unsigned long)map); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + desc->private_data = map; +} + +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + + return ((unsigned long)desc->private_data) & flag; +} + bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, */ long hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) + long from, long to, + struct vm_area_desc *desc, + vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -7287,12 +7307,6 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } - /* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ - hugetlb_vma_lock_alloc(vma); - /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. 
Assume !vma is a shm mapping + * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + set_vma_desc_resv_map(desc, resv_map); + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) @@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode *inode, chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -7423,16 +7437,15 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - hugetlb_vma_lock_free(vma); - if (!vma || vma->vm_flags & VM_MAYSHARE) + if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_resv_map(vma, NULL); + set_vma_desc_resv_map(desc, NULL); } return chg < 0 ? chg : add < 0 ? add : -EINVAL; } -- cgit v1.2.3 From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:30 +0100 Subject: mm: add shmem_zero_setup_desc() Add the ability to set up a shared anonymous mapping based on a VMA descriptor rather than a VMA. This is a prerequisite for converting to the char mm driver to use the mmap_prepare hook. Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..5b368f9549d6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); diff --git a/mm/shmem.c b/mm/shmem.c index 8b9fcdd144c8..da1df4270309 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5880,14 +5880,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -5895,7 +5890,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -5907,6 +5913,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) return 0; } +/** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. 
+ * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space -- cgit v1.2.3 From 5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Oct 2025 16:44:25 -0700 Subject: memcg: manually uninline __memcg_memory_event __memcg_memory_event() has been unnecessarily marked inline even when it is not really performance critical. It is usually called to track extreme conditions. Over the time, it has evolved to include more functionality and inlining it is causing more harm. Before the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35645 10574 4192 50411 c4eb mm/memcontrol.o 54738 1658 0 56396 dc4c net/ipv4/tcp_input.o 34644 1065 0 35709 8b7d net/ipv4/tcp_output.o After the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35137 10446 4192 49775 c26f mm/memcontrol.o 54322 1562 0 55884 da4c net/ipv4/tcp_input.o 34492 1017 0 35509 8ab5 net/ipv4/tcp_output.o [akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph] Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 32 ++------------------------------ mm/memcontrol.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe254813123..8c0f15e5978f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. 
*/ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 976412c8196e..025da46d9959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1626,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning) +{ + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + + atomic_long_inc(&memcg->memory_events_local[event]); + if (!swap_event && allow_spinning) + cgroup_file_notify(&memcg->events_local_file); + + do { + atomic_long_inc(&memcg->memory_events[event]); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + break; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); +} +EXPORT_SYMBOL_GPL(__memcg_memory_event); + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { -- cgit v1.2.3 From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:28 +0800 Subject: mm: add a ptdesc flag to mark kernel page tables The page tables used to map the kernel and userspace often have very different handling rules. There are frequently *_kernel() variants of functions just for kernel page tables. That's not great and has lead to code duplication. Instead of having completely separate call paths, allow a 'ptdesc' to be marked as being for kernel mappings. Introduce helpers to set and clear this status. Note: this uses the PG_referenced bit. Page flags are a great fit for this since it is truly a single bit of information. Use PG_referenced itself because it's a fairly benign flag (as opposed to things like PG_lock). It's also (according to Willy) unlikely to go away any time soon. PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be cleared before freeing the page, and pages coming out of the allocator should have it cleared. Regardless, introduce an API to clear it anyway. Having symmetry in the API makes it easier to change the underlying implementation later, like if there was a need to move to a PAGE_FLAGS_CHECK_AT_FREE bit. 
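A minimal usage sketch of the three helpers (the real call sites are wired up by the following patches in this series; this is illustration only, using the existing pagetable_alloc() helper):

	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, 0);

	if (ptdesc)
		ptdesc_set_kernel(ptdesc);	/* this table maps the kernel */

	/* ... later, code that must treat kernel page tables specially: */
	if (ptdesc_test_kernel(ptdesc))
		ptdesc_clear_kernel(ptdesc);	/* e.g. just before freeing */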
Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d060081caa5..5c887c4ea29e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) return test_bit(PT_reserved, &pt->pt_flags.f); } +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags -- cgit v1.2.3 From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:29 +0800 Subject: mm: actually mark kernel page table pages Now that the API is in place, mark kernel page table pages just after they are allocated. Unmark them just before they are freed. Note: Unconditionally clearing the 'kernel' marking (via ptdesc_clear_kernel()) would be functionally identical to what is here. But having the if() makes it logically clear that this function can be used for kernel and non-kernel page tables. 
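The generic allocation helpers below all gain the same small check, roughly:

	if (mm == &init_mm)
		ptdesc_set_kernel(ptdesc);

(the kernel-only pte helper sets the mark unconditionally, since it never serves userspace), and pagetable_free() clears the mark again before handing the page back to the allocator.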
Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ include/linux/mm.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 3c8ec3bfea44..b9d2a7c79b93 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) return NULL; } + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad pagetable_free(ptdesc); return NULL; } + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) @@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_pud_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) @@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_p4d_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order return NULL; pagetable_pgd_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c887c4ea29e..8f46048875a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + __free_pages(page, compound_order(page)); } -- cgit v1.2.3 From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:31 +0800 Subject: mm: introduce pure page table freeing function The pages used for ptdescs are currently freed back to the allocator in a single location. They will shortly be freed from a second location. Create a simple helper that just frees them back to the allocator. 
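With the helper in place, the deferred-freeing patch later in this series can release a whole batch from its work function with:

	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);

while the ordinary synchronous path keeps calling it directly from pagetable_free().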
Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f46048875a7..88c0a0fae43a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - if (ptdesc_test_kernel(pt)) ptdesc_clear_kernel(pt); - __free_pages(page, compound_order(page)); + __pagetable_free(pt); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) -- cgit v1.2.3 From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:33 +0800 Subject: mm: introduce deferred freeing for kernel page tables This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. 
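The mechanism is opt-in: an architecture that needs the batching (because freeing a kernel page table must be paired with an expensive secondary-TLB invalidation) selects the new symbol, as the x86 IOMMU-SVA patch later in this series does:

	select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA

All other architectures keep the existing immediate free.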
Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++--- mm/Kconfig | 3 +++ mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 88c0a0fae43a..a6fd9f5aaf30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt) */ static inline void pagetable_free(struct ptdesc *pt) { - if (ptdesc_test_kernel(pt)) + if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); - - __pagetable_free(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) diff --git a/mm/Kconfig b/mm/Kconfig index 4971436c8697..682a5c39a1a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +config ASYNC_KERNEL_PGTABLE_FREE + def_bool n + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..1c7caa8ef164 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -406,3 +406,40 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif -- cgit v1.2.3 From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 
22 Oct 2025 16:26:34 +0800 Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space Introduce a new IOMMU interface to flush IOTLB paging cache entries for the CPU kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically before any kernel page table page is freed and reused. This addresses the main issue with vfree() which is a common occurrence and can be triggered by unprivileged users. While this resolves the primary problem, it doesn't address some extremely rare case related to memory unplug of memory that was present as reserved memory at boot, which cannot be triggered by unprivileged users. The discussion can be found at the link below. Enable SVA on x86 architecture since the IOMMU can now receive notification to flush the paging cache before freeing the CPU kernel page table pages. Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Suggested-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++---- include/linux/iommu.h | 4 ++++ mm/pgtable-generic.c | 2 ++ 4 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..a3700766a8c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,6 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index a0442faad952..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of @@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (!group) return ERR_PTR(-ENODEV); - if (IS_ENABLED(CONFIG_X86)) - return ERR_PTR(-EOPNOTSUPP); - mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. 
*/ @@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains); + if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if (!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1c7caa8ef164..8c22be79b734 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work) list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock); + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) __pagetable_free(pt); } -- cgit v1.2.3 From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:41 +0800 Subject: mm, swap: cleanup swap entry allocation parameter We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap: use the swap table for the swap cache and switch API"). Before that commit the GFP parameter is already almost identical for all callers, so nothing changed by that commit. Swap table just moved the GFP to lower layer and make it more defined and changes depend on atomic or sleep allocation. Now this parameter is no longer used, just remove it. No behavior change. 
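Call sites simply drop the now-unused flags argument, e.g.:

	if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))

becomes

	if (folio_alloc_swap(folio))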
Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swapfile.c | 3 +-- mm/vmscan.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..a4b264817735 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index da1df4270309..e1dc2d8e939c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1617,7 +1617,7 @@ try_split: folio_mark_uptodate(folio); } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; diff --git a/mm/swapfile.c b/mm/swapfile.c index 808052319c0b..d87b562ae661 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1417,7 +1417,6 @@ static bool swap_sync_discard(void) /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1425,7 +1424,7 @@ static bool swap_sync_discard(void) * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; diff --git a/mm/vmscan.c b/mm/vmscan.c index ecc90517b791..c23c9616052a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,7 +1318,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1334,7 +1334,7 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* -- cgit v1.2.3 From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:25 -0700 Subject: mm/damon/core: add damon_target->obsolete for pin-point removal Patch series "mm/damon: support pin-point targets removal". DAMON maintains the targets in a list, and allows committing only an entire list of targets having the new parameters. Targets having same index on the lists are treated as matching source and destination targets. If an existing target cannot find a matching one in the sources list, the target is removed. 
This means that there is no way to remove only a specific monitoring target in the middle of the current targets list. Such pin-point target removal is really needed in some use cases, though. Monitoring access patterns on virtual address spaces of processes that spawned from the same ancestor is one example. If a process of the group is terminated, the user may want to remove the matching DAMON target as soon as possible, to save in-kernel memory usage for the unnecessary target data. The user may also want to do that without turning DAMON off or removing unnecessary targets, to keep the current monitoring results for other active processes. Extend DAMON kernel API and sysfs ABI to support the pin-point removal in the following way. For API, add a new damon_target field, namely 'obsolete'. If the field on parameters commit source target is set, it means the matching destination target is obsolete. Then the parameters commit logic removes the destination target from the existing targets list. For sysfs ABI, add a new file under the target directory, namely 'obsolete_target'. It is connected with the 'obsolete' field of the commit source targets, so internally using the new API. Also add a selftest for the new feature. The related helper scripts for manipulating the sysfs interface and dumping in-kernel DAMON status are also extended for this. Note that the selftest part was initially posted as an individual RFC series [1], but now merged into this one. Bijan Tabatabai has originally reported this issue, and participated in this solution design on a GitHub issue [1] for DAMON user-space tool. This patch (of 9): DAMON's monitoring targets parameters update function, damon_commit_targets(), is not providing a way to remove a target in the middle of the existing targets list. Extend the API by adding a field to struct damon_target. If the field of a damon_commit_targets() source target is set, it indicates the matching target on the existing targets list is obsolete. damon_commit_targets() understands that and removes those from the list, while respecting the index based matching for other non-obsolete targets. Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org Link: https://github.com/damonitor/damo/issues/36 [1] Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 9ee026c2db53..f3566b978cdf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. 
+ * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 769da97fcb26..06ad359024ad 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -479,6 +479,7 @@ struct damon_target *damon_new_target(void) t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); INIT_LIST_HEAD(&t->list); + t->obsolete = false; return t; } @@ -1187,7 +1188,11 @@ static int damon_commit_targets( damon_for_each_target_safe(dst_target, next, dst) { src_target = damon_nth_target(i++, src); - if (src_target) { + /* + * If src target is obsolete, do not commit the parameters to + * the dst target, and further remove the dst target. + */ + if (src_target && !src_target->obsolete) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), @@ -1210,6 +1215,9 @@ static int damon_commit_targets( damon_for_each_target_safe(src_target, next, src) { if (j++ < i) continue; + /* target to remove has no matching dst */ + if (src_target->obsolete) + return -EINVAL; new_target = damon_new_target(); if (!new_target) return -ENOMEM; -- cgit v1.2.3 From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 24 Oct 2025 10:09:02 +0100 Subject: mm/vma: small VMA lock cleanups We declare vma_start_read() as a static function in mm/mmap_lock.c, so there is no need to provide a stub for !CONFIG_PER_VMA_LOCK. __is_vma_write_locked() is declared in a header and should therefore be static inline. Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to make precedence clear. Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..e05da70dc0cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 Oct 2025 03:56:38 +0100 Subject: mm: make INVALID_PHYS_ADDR a generic macro INVALID_PHYS_ADDR has very similar definitions across the code base. Hence just move that inside header for more generic usage. Also drop the now redundant ones which are no longer required. Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev [s390] Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 -- arch/s390/boot/vmem.c | 1 - drivers/vdpa/vdpa_user/iova_domain.h | 2 -- include/linux/mm.h | 2 ++ kernel/dma/swiotlb.c | 2 -- 5 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..94e29e3574ff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -470,8 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, mutex_unlock(&fixmap_lock); } -#define INVALID_PHYS_ADDR (-1ULL) - static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index cea3de4dce8c..fbe64ffdfb96 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -16,7 +16,6 @@ #include "decompressor.h" #include "boot.h" -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 775cad5238f3..a923971a64f5 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -17,8 +17,6 @@ #define IOVA_START_PFN 1 -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - #define BOUNCE_MAP_SHIFT 12 #define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT) #define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6fd9f5aaf30..7bcd9e6fbc3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0d37da3d95b6..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. 
-- cgit v1.2.3 From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:07 +0800 Subject: mm/swap: do not choose swap device according to numa node Patch series "mm/swapfile.c: select swap devices of default priority round robin", v5. Currently, on systems with multiple swap devices, swap allocation selects a swap device according to priority: the device with the highest priority is used for allocation first. People can specify a priority from 0 to 32767 when swapping on a device, or the system will assign priorities starting from -2 and counting downwards by default. Meanwhile, on NUMA systems, the swap device attached to a node is considered first on that node. In the current code, an array of plists, swap_avail_heads[nid], is used to organize swap devices on each NUMA node. For each NUMA node, there is a plist organizing all swap devices. The 'prio' value in the plist is the negated value of the device's priority, because plists are sorted from low to high. The swap device owning a node_id is promoted to the front position on that NUMA node, and the other swap devices are put in order of their default priority. E.g. I got a system with 8 NUMA nodes and set up 4 zram partitions as swap devices. Current behaviour: their priorities will be (note that -1 is skipped): NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 0B -2 /dev/zram1 partition 16G 0B -3 /dev/zram2 partition 16G 0B -4 /dev/zram3 partition 16G 0B -5 And their positions in the 8 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:3 prio:4 prio:5 swap_avail_lists[1]: /* node 1's available swap device list */ zram1 -> zram0 -> zram2 -> zram3 prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ zram2 -> zram0 -> zram1 -> zram3 prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ zram3 -> zram0 -> zram1 -> zram2 prio:1 prio:2 prio:3 prio:4 swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:2 prio:3 prio:4 prio:5 The adjustment for the swap device matching a node's id was intended to decrease lock-contention pressure on a single swap device by having different nodes use different swap devices. The adjustment was introduced in commit a2468cc9bfdf ("swap: choose swap device according to numa node"). However, the adjustment is a little coarse-grained. On a node, the swap device sharing the node's id is always selected first by that node's CPUs until it is exhausted, then the next one. And on the other nodes, where no swap device shares the node id, the swap device with priority '-2' is selected first until exhausted, then the one with priority '-3'. This is the swapon output while a high-pressure vm-scability test is running. It clearly shows that zram0 is heavily exploited until exhausted. =================================== [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 15.7G -2 /dev/zram1 partition 16G 3.4G -3 /dev/zram2 partition 16G 3.4G -4 /dev/zram3 partition 16G 2.6G -5 The node-based strategy for selecting a swap device is much better than the old one-by-one selection. However, it is still unreasonable, because swap devices are assumed to have similar access speed when no priority is specified at swapon time.
It is unfair, and makes little sense, that a swap device gets a higher priority than another merely because it was swapped on earlier. So in this patchset, a change is made to select swap devices round robin when they have the default priority. In code, the plist array swap_avail_heads[nid] is replaced with a single plist swap_avail_head, which reverts commit a2468cc9bfdf. Meanwhile, on top of the revert, a further change makes every device without a specified priority get the same default priority '-1'. Of course, swap devices with a specified priority are still always put foremost; this is not impacted. If you care about their different access speeds, use 'swapon -p xx' to assign priorities to your swap devices. New behaviour: swap_avail_list: /* one global available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:1 prio:1 prio:1 This is the swapon output while the high-pressure vm-scability test is running; all devices are selected round robin: ======================================= [root@hp-dl385g10-03 linux]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 12.6G -1 /dev/zram1 partition 16G 12.6G -1 /dev/zram2 partition 16G 12.6G -1 /dev/zram3 partition 16G 12.6G -1 With the change, we can see about an 18% efficiency improvement, as below: vm-scability test: ================== Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap) Before: After: System time: 637.92 s 526.74 s (lower is better) Sum Throughput: 3546.56 MB/s 4207.56 MB/s (higher is better) Single process Throughput: 114.40 MB/s 135.72 MB/s (higher is better) free latency: 10138455.99 us 6810119.01 us (low is better) This patch (of 2): This reverts commit a2468cc9bfdf ("swap: choose swap device according to numa node"). After this patch, the behaviour changes back to the pre-a2468cc9bfdf one: priorities are set from -1 and counting downwards by default, and when swapping, swap devices are exhausted one by one according to priority from high to low. This is preparation work for the later change. [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 16G -1 /dev/zram1 partition 16G 966.2M -2 /dev/zram2 partition 16G 0B -3 /dev/zram3 partition 16G 0B -4 Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Kairui Song Cc: Barry Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/index.rst | 1 - Documentation/admin-guide/mm/swap_numa.rst | 78 ----------------------------- include/linux/swap.h | 11 +--- mm/swapfile.c | 80 ++++++------------------------ 4 files changed, 15 insertions(+), 155 deletions(-) delete mode 100644 Documentation/admin-guide/mm/swap_numa.rst (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ebc83ca20fdc..bbb563cba5d2 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -39,7 +39,6 @@ the Linux memory management.
shrinker_debugfs slab soft-dirty - swap_numa transhuge userfaultfd zswap diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst deleted file mode 100644 index 2e630627bcee..000000000000 --- a/Documentation/admin-guide/mm/swap_numa.rst +++ /dev/null @@ -1,78 +0,0 @@ -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. So if multiple swap devices are attached to the same -node, they will all be promoted to priority -1 on that node's plist and will -be used round robin before any other swap devices. 
diff --git a/include/linux/swap.h b/include/linux/swap.h index a4b264817735..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 125d893bb706..ce3580e2f4f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +static int least_priority; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -1130,7 +1130,6 @@ done: /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1169,7 +1167,6 @@ skip: /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry, static bool swap_alloc_slow(swp_entry_t *entry, int order) { - int node; unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); @@ -1380,7 +1374,7 @@ start_over: * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. 
*/ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&si->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); @@ -1394,11 +1388,10 @@ start_over: static bool swap_sync_discard(void) { bool ret = false; - int nid = numa_node_id(); struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) @@ -2709,25 +2702,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - if (prio >= 0) si->prio = prio; else @@ -2737,16 +2716,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2919,15 +2889,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; - int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } + si->avail_list.prio--; } least_priority++; } @@ -3168,9 +3134,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3209,8 +3174,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3467,9 +3431,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -4079,7 +4040,6 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; - int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; @@ -4098,8 +4058,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, + avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4111,18 +4071,6 @@ void 
__folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* -- cgit v1.2.3 From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:28 +0000 Subject: mm: convert memory block states (MEM_*) macros to enum Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2. The MEM_* constants indicating the state of a memory block are currently defined as macros, meaning their definitions will be omitted from the debuginfo on most kernel builds. This makes it harder for debuggers to correctly map the block state at runtime, which can be quite useful when analysing errors related to memory hot plugging and unplugging with tools such as drgn. Converting the constants to an enum ensures the correct information is emitted by the compiler and available for the debugger, without needing to hard-code them into the debugger and track their changes. This patch series aims to replace the current macros with a newly created enum named memory_block_state, while also taking advantage of the compile time guarantees that we get when using enums. The first patch does the conversion of the macros to an enum, while the 2nd and 3rd patches use this enum to clean up some type declarations and make sure that only valid values are used. This patch (of 3): Converting the MEM_* constants from macros to an enum ensures that their values will be correctly emitted in the debug symbols, making it easier to trace the meaning of each value when debugging with tools such as drgn, without the need to hard-code the values. 
Since the values are mutually exclusive and they are not exposed directly to userspace, I also dropped the misleading pattern (1<<N). Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..f4e358477c6a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,6 +64,18 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, + MEM_PREPARE_ONLINE, + MEM_FINISH_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) - struct memory_notify { /* * The altmap_start_pfn and altmap_nr_pages fields are designated for -- cgit v1.2.3 From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:30 +0000 Subject: mm: change type of state in struct memory_block The state of a memory block should be restricted to values specified in the documentation of the memory hotplug API. However, since the state field in the memory_block struct was defined as an unsigned long, this restriction was not enforced at compile time. With the introduction of the enum memory_block_state, it is now possible to incorporate the desired semantics in the field declaration and enforce these restrictions at compile time.
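As a hedged illustration of the compile-time benefit (the helper below is hypothetical, not part of the patch): with the state typed as an enum, a switch over it can be checked for exhaustiveness by the compiler via -Wswitch, which a plain unsigned long never allowed:

/* Hypothetical helper: -Wswitch can now warn if an enumerator such as
 * MEM_PREPARE_ONLINE is left unhandled here. */
static const char *memory_block_state_name(enum memory_block_state state)
{
	switch (state) {
	case MEM_ONLINE:		return "online";
	case MEM_GOING_OFFLINE:		return "going-offline";
	case MEM_OFFLINE:		return "offline";
	case MEM_GOING_ONLINE:		return "going-online";
	case MEM_CANCEL_ONLINE:		return "cancel-online";
	case MEM_CANCEL_OFFLINE:	return "cancel-offline";
	case MEM_PREPARE_ONLINE:	return "prepare-online";
	case MEM_FINISH_OFFLINE:	return "finish-offline";
	}
	return "unknown";
}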
[akpm@linux-foundation.org: fix whitespace, per Randy] Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..3d17dd774947 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, break; default: WARN_ON(1); - return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state); + return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state); } return sysfs_emit(buf, "%s\n", output); diff --git a/include/linux/memory.h b/include/linux/memory.h index f4e358477c6a..ca20cbdd71f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -78,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* -- cgit v1.2.3 From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:32 +0000 Subject: mm: change type of parameter for memory_notify memory_notify() is responsible for sending events related to memory hotplugging to a notification queue. Since all the events must match one of the values from the enum memory_block_state, it is appropriate to change the function parameter type to make this condition explicit at compile time. 
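As a hedged illustration (hypothetical subscriber; the mydrv_* names are made up and not part of the patch): memory_notify() feeds a blocking notifier chain, so a callback registered with register_memory_notifier() still receives the state through the generic notifier 'action' argument and can compare it against the enum values:

/* Hypothetical memory hotplug subscriber sketch. */
static int mydrv_memory_callback(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct memory_notify *mn = data;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/* quiesce use of the affected range before it goes away */
		pr_debug("block of %lu pages going offline\n", mn->nr_pages);
		break;
	case MEM_ONLINE:
		/* the new memory block is usable from here on */
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block mydrv_memory_nb = {
	.notifier_call = mydrv_memory_callback,
};

/* somewhere in driver init: register_memory_notifier(&mydrv_memory_nb); */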
Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3d17dd774947..c03f3b5e5e6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -204,9 +204,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%s\n", output); } -int memory_notify(unsigned long val, void *v) +int memory_notify(enum memory_block_state state, void *v) { - return blocking_notifier_call_chain(&memory_chain, val, v); + return blocking_notifier_call_chain(&memory_chain, state, v); } #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) diff --git a/include/linux/memory.h b/include/linux/memory.h index ca20cbdd71f2..ca3eb1db6cc8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, -- cgit v1.2.3 From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region is mapped using remap_pfn_range(), for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function that allows modules managing the device memory to register the device memory SPA and the address space associated with it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belongs to and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows a sequence largely similar to the memory_failure() handler for struct-page-backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes that map the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs().
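As a purely illustrative, hedged sketch (the nvdev_* names are hypothetical and not part of this patch), a module that maps device memory with remap_pfn_range() would register its PFN range roughly as follows, so that memory_failure() can later find the mapping address_space:

/* Hypothetical driver-side sketch: register a PFN range that has no
 * struct pages so poison on it can be routed to the right mapping. */
static struct pfn_address_space nvdev_pfn_space;

static int nvdev_register_poison_range(struct address_space *mapping,
				       unsigned long start_pfn,
				       unsigned long nr_pfns)
{
	nvdev_pfn_space.node.start = start_pfn;
	nvdev_pfn_space.node.last = start_pfn + nr_pfns - 1;	/* inclusive */
	nvdev_pfn_space.mapping = mapping;

	return register_pfn_address_space(&nvdev_pfn_space);
}

/* on teardown: unregister_pfn_address_space(&nvdev_pfn_space); */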
Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/memory-failure.h | 17 +++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + mm/Kconfig | 1 + mm/memory-failure.c | 145 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/memory-failure.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2625bc3d53d8..5cf6873569d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11557,6 +11557,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7bcd9e6fbc3c..b636d12bb651 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4285,6 +4285,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/Kconfig b/mm/Kconfig index eae03b14f7de..d548976d0e0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -741,6 +741,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. 
This allows a system to continue running diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 560884dd6250..77391b6f9f76 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -885,6 +890,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1277,7 +1283,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -2147,6 +2153,135 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pfn(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + unsigned long pfn) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pfn, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pfn, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct address_space *mapping, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { + if (vma->vm_mm == t->mm) + add_to_kill_pfn(t, vma, to_kill, pfn); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. 
+ */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2196,6 +2331,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); -- cgit v1.2.3 From 197b3f3c70d61ff1c7ca24f66d567e06fe8ed3d9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 12 Nov 2025 14:55:30 +0100 Subject: string: provide strends() Implement a function for checking if a string ends with a different string and add its kunit test cases. Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-1-b51f97b1abd8@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/string.h | 18 ++++++++++++++++++ lib/tests/string_kunit.c | 13 +++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index fdd3442c6bcb..929d05d1247c 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -562,4 +562,22 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } +/** + * strends - Check if a string ends with another string. + * @str - NULL-terminated string to check against @suffix + * @suffix - NULL-terminated string defining the suffix to look for in @str + * + * Returns: + * True if @str ends with @suffix. False in all other cases. 
+ */ +static inline bool strends(const char *str, const char *suffix) +{ + unsigned int str_len = strlen(str), suffix_len = strlen(suffix); + + if (str_len < suffix_len) + return false; + + return !(strcmp(str + str_len - suffix_len, suffix)); +} + #endif /* _LINUX_STRING_H_ */ diff --git a/lib/tests/string_kunit.c b/lib/tests/string_kunit.c index 0ed7448a26d3..f9a8e557ba77 100644 --- a/lib/tests/string_kunit.c +++ b/lib/tests/string_kunit.c @@ -602,6 +602,18 @@ static void string_test_memtostr(struct kunit *test) KUNIT_EXPECT_EQ(test, dest[7], '\0'); } +static void string_test_strends(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, strends("foo-bar", "bar")); + KUNIT_EXPECT_TRUE(test, strends("foo-bar", "-bar")); + KUNIT_EXPECT_TRUE(test, strends("foobar", "foobar")); + KUNIT_EXPECT_TRUE(test, strends("foobar", "")); + KUNIT_EXPECT_FALSE(test, strends("bar", "foobar")); + KUNIT_EXPECT_FALSE(test, strends("", "foo")); + KUNIT_EXPECT_FALSE(test, strends("foobar", "ba")); + KUNIT_EXPECT_TRUE(test, strends("", "")); +} + static struct kunit_case string_test_cases[] = { KUNIT_CASE(string_test_memset16), KUNIT_CASE(string_test_memset32), @@ -623,6 +635,7 @@ static struct kunit_case string_test_cases[] = { KUNIT_CASE(string_test_strlcat), KUNIT_CASE(string_test_strtomem), KUNIT_CASE(string_test_memtostr), + KUNIT_CASE(string_test_strends), {} }; -- cgit v1.2.3 From eb374f764a7012eda28019266a6d9191670c4fa5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 12 Nov 2025 14:55:35 +0100 Subject: gpio: provide gpiod_is_shared() Provide an interface allowing consumers to check if a GPIO descriptor represents a GPIO that can potentially be shared by multiple consumers at the same time. This is exposed to allow subsystems that already work around the limitations of the current non-exclusive GPIO handling in some ways, to gradually convert to relying on the new shared GPIO feature of GPIOLIB. Extend the gpiolib-shared module to mark the GPIO shared proxy descriptors with a flag checked by the new interface. Reviewed-by: Linus Walleij Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-6-b51f97b1abd8@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-shared.c | 18 ++++++++++++++++++ drivers/gpio/gpiolib.c | 20 ++++++++++++++++++++ drivers/gpio/gpiolib.h | 1 + include/linux/gpio/consumer.h | 9 +++++++++ 4 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib-shared.c b/drivers/gpio/gpiolib-shared.c index 1fcbf53b6eca..fa1d16635ea7 100644 --- a/drivers/gpio/gpiolib-shared.c +++ b/drivers/gpio/gpiolib-shared.c @@ -314,6 +314,24 @@ int gpio_device_setup_shared(struct gpio_device *gdev) guard(mutex)(&gpio_shared_lock); + list_for_each_entry(entry, &gpio_shared_list, list) { + list_for_each_entry(ref, &entry->refs, list) { + if (gdev->dev.parent == &ref->adev.dev) { + /* + * This is a shared GPIO proxy. Mark its + * descriptor as such and return here. + */ + __set_bit(GPIOD_FLAG_SHARED_PROXY, + &gdev->descs[0].flags); + return 0; + } + } + } + + /* + * This is not a shared GPIO proxy but it still may be the device + * exposing shared pins. Find them and create the proxy devices. 
+ */ list_for_each_entry(entry, &gpio_shared_list, list) { if (!device_match_fwnode(&gdev->dev, entry->fwnode)) continue; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 1e4c99179712..678d07dc768c 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3990,6 +3990,26 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name) } EXPORT_SYMBOL_GPL(gpiod_set_consumer_name); +/** + * gpiod_is_shared() - check if this GPIO can be shared by multiple consumers + * @desc: GPIO to inspect + * + * Returns: + * True if this GPIO can be shared by multiple consumers at once. False if it's + * a regular, exclusive GPIO. + * + * Note: + * This function returning true does not mean that this GPIO is currently being + * shared. It means the GPIO core has registered the fact that the firmware + * configuration indicates that it can be shared by multiple consumers and is + * in charge of arbitrating the access. + */ +bool gpiod_is_shared(const struct gpio_desc *desc) +{ + return test_bit(GPIOD_FLAG_SHARED_PROXY, &desc->flags); +} +EXPORT_SYMBOL_GPL(gpiod_is_shared); + /** * gpiod_to_irq() - return the IRQ corresponding to a GPIO * @desc: gpio whose IRQ will be returned (already requested) diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 09ac92ed0853..abd870fb4a3b 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -205,6 +205,7 @@ struct gpio_desc { #define GPIOD_FLAG_EVENT_CLOCK_REALTIME 18 /* GPIO CDEV reports REALTIME timestamps in events */ #define GPIOD_FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ #define GPIOD_FLAG_SHARED 20 /* GPIO is shared by multiple consumers */ +#define GPIOD_FLAG_SHARED_PROXY 21 /* GPIO is a virtual proxy to a physically shared pin. */ /* Connection label */ struct gpio_desc_label __rcu *label; diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 00df68c51405..a8acb7c0b5af 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -167,6 +167,8 @@ int gpiod_cansleep(const struct gpio_desc *desc); int gpiod_to_irq(const struct gpio_desc *desc); int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name); +bool gpiod_is_shared(const struct gpio_desc *desc); + /* Convert between the old gpio_ and new gpiod_ interfaces */ struct gpio_desc *gpio_to_desc(unsigned gpio); int desc_to_gpio(const struct gpio_desc *desc); @@ -520,6 +522,13 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc, return -EINVAL; } +static inline bool gpiod_is_shared(const struct gpio_desc *desc) +{ + /* GPIO can never have been requested */ + WARN_ON(desc); + return false; +} + static inline struct gpio_desc *gpio_to_desc(unsigned gpio) { return NULL; -- cgit v1.2.3 From b98994cb9bc24f5c7575c86650f96c384576fdfa Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Mon, 17 Nov 2025 02:54:19 +0000 Subject: mtd: spinand: esmt: add support for F50L1G41LC This adds support for ESMT F50L1G41LC, which appears to be an updated version of the already supported F50L1G41LB. Add esmt_8c SPI_NAND manufacturer to account for the newly used vendor ID with support for the ESMT F50L1G41LC chip. 
Link: https://github.com/openwrt/openwrt/pull/15214#issuecomment-3514824435 Signed-off-by: Daniel Golle Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/core.c | 1 + drivers/mtd/nand/spi/esmt.c | 24 ++++++++++++++++++++++++ include/linux/mtd/spinand.h | 1 + 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index f92133b8e1a6..d207286572d8 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -1227,6 +1227,7 @@ static const struct nand_ops spinand_ops = { static const struct spinand_manufacturer *spinand_manufacturers[] = { &alliancememory_spinand_manufacturer, &ato_spinand_manufacturer, + &esmt_8c_spinand_manufacturer, &esmt_c8_spinand_manufacturer, &fmsh_spinand_manufacturer, &foresee_spinand_manufacturer, diff --git a/drivers/mtd/nand/spi/esmt.c b/drivers/mtd/nand/spi/esmt.c index 9a9325c0bc49..e60e4ac1fd6f 100644 --- a/drivers/mtd/nand/spi/esmt.c +++ b/drivers/mtd/nand/spi/esmt.c @@ -12,6 +12,7 @@ /* ESMT uses GigaDevice 0xc8 JECDEC ID on some SPI NANDs */ #define SPINAND_MFR_ESMT_C8 0xc8 +#define SPINAND_MFR_ESMT_8C 0x8c #define ESMT_F50L1G41LB_CFG_OTP_PROTECT BIT(7) #define ESMT_F50L1G41LB_CFG_OTP_LOCK \ @@ -184,6 +185,21 @@ static const struct spinand_fact_otp_ops f50l1g41lb_fact_otp_ops = { .read = spinand_fact_otp_read, }; + +static const struct spinand_info esmt_8c_spinand_table[] = { + SPINAND_INFO("F50L1G41LC", + SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x2C), + NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1), + NAND_ECCREQ(1, 512), + SPINAND_INFO_OP_VARIANTS(&read_cache_variants, + &write_cache_variants, + &update_cache_variants), + 0, + SPINAND_ECCINFO(&f50l1g41lb_ooblayout, NULL), + SPINAND_USER_OTP_INFO(28, 2, &f50l1g41lb_user_otp_ops), + SPINAND_FACT_OTP_INFO(2, 0, &f50l1g41lb_fact_otp_ops)), +}; + static const struct spinand_info esmt_c8_spinand_table[] = { SPINAND_INFO("F50L1G41LB", SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x01, 0x7f, @@ -224,6 +240,14 @@ static const struct spinand_info esmt_c8_spinand_table[] = { static const struct spinand_manufacturer_ops esmt_spinand_manuf_ops = { }; +const struct spinand_manufacturer esmt_8c_spinand_manufacturer = { + .id = SPINAND_MFR_ESMT_8C, + .name = "ESMT", + .chips = esmt_8c_spinand_table, + .nchips = ARRAY_SIZE(esmt_8c_spinand_table), + .ops = &esmt_spinand_manuf_ops, +}; + const struct spinand_manufacturer esmt_c8_spinand_manufacturer = { .id = SPINAND_MFR_ESMT_C8, .name = "ESMT", diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 927c10d78769..ce76f5c632e1 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -354,6 +354,7 @@ struct spinand_manufacturer { /* SPI NAND manufacturers */ extern const struct spinand_manufacturer alliancememory_spinand_manufacturer; extern const struct spinand_manufacturer ato_spinand_manufacturer; +extern const struct spinand_manufacturer esmt_8c_spinand_manufacturer; extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer; extern const struct spinand_manufacturer fmsh_spinand_manufacturer; extern const struct spinand_manufacturer foresee_spinand_manufacturer; -- cgit v1.2.3 From e678c2a0063ec931642b3c5935fb0c3c1282b6b3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 4 Nov 2025 14:16:44 +0200 Subject: PCI: Add Intel Nova Lake S audio Device ID Add Nova Lake S (NVL-S) audio Device ID The ID will be used by HDA legacy, SOF audio stack and the driver to determine which audio stack should be used 
(intel-dsp-config). Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20251104121650.21872-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 92ffc4373f6d..a9a089566b7c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3075,6 +3075,7 @@ #define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff #define PCI_DEVICE_ID_INTEL_HDA_FCL 0x67a8 +#define PCI_DEVICE_ID_INTEL_HDA_NVL_S 0x6e50 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 -- cgit v1.2.3 From 33cf66d88306663d16e4759e9d24766b0aaa2e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 17:01:31 +0100 Subject: sched/fair: Proportional newidle balance Add a randomized algorithm that runs newidle balancing proportional to its success rate. This improves schbench significantly: 6.18-rc4: 2.22 Mrps/s 6.18-rc4+revert: 2.04 Mrps/s 6.18-rc4+revert+random: 2.18 Mrps/S Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: 6.17: -6% 6.17+revert: 0% 6.17+revert+random: -1% Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Tested-by: Chris Mason Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com Link: https://patch.msgid.link/20251107161739.770122091@infradead.org --- include/linux/sched/topology.h | 3 +++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++---- kernel/sched/features.h | 5 +++++ kernel/sched/sched.h | 7 +++++++ kernel/sched/topology.c | 6 ++++++ 6 files changed, 64 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index bbcfdf12aa6e..45c0022b91ce 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -92,6 +92,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 699db3f46df6..9f10cfbdc228 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_PROXY_EXEC DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); @@ -8489,6 +8490,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abcbb67dd785..1855975b8248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12224,11 +12224,27 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, 
unsigned int success) +{ + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) { unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; unsigned long now = jiffies; + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the @@ -12276,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12912,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12919,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0607def744af..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. 
+ */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index def9ab7b59d4..b419a4d98461 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include #include #include #include @@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 711076aa4980..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -- cgit v1.2.3 From 96498e804cb6629e02747336a0a33e4955449732 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 17 Nov 2025 17:12:47 +0100 Subject: spi: davinci: remove platform data header There are no longer any board files including the DaVinci SPI platform data header. Let's move the bits and pieces that are used in the driver into the driver .c file itself and remove the header. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20251117-davinci-spi-v2-1-cd799d17f04a@linaro.org Signed-off-by: Mark Brown --- drivers/spi/spi-davinci.c | 64 ++++++++++++++++++++++++++- include/linux/platform_data/spi-davinci.h | 73 ------------------------------- 2 files changed, 62 insertions(+), 75 deletions(-) delete mode 100644 include/linux/platform_data/spi-davinci.h (limited to 'include/linux') diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index a29934422356..21a14e800eed 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -19,8 +20,6 @@ #include #include -#include - #define CS_DEFAULT 0xFF #define SPIFMT_PHASE_MASK BIT(16) @@ -98,8 +97,69 @@ #define SPIDEF 0x4c #define SPIFMT0 0x50 +#define SPI_IO_TYPE_POLL 1 +#define SPI_IO_TYPE_DMA 2 + #define DMA_MIN_BYTES 16 +enum { + SPI_VERSION_1, /* For DM355/DM365/DM6467 */ + SPI_VERSION_2, /* For DA8xx */ +}; + +/** + * struct davinci_spi_platform_data - Platform data for SPI master device on DaVinci + * + * @version: version of the SPI IP. Different DaVinci devices have slightly + * varying versions of the same IP. + * @num_chipselect: number of chipselects supported by this SPI master + * @intr_line: interrupt line used to connect the SPI IP to the ARM interrupt + * controller withn the SoC. Possible values are 0 and 1. + * @prescaler_limit: max clock prescaler value + * @cshold_bug: set this to true if the SPI controller on your chip requires + * a write to CSHOLD bit in between transfers (like in DM355). + * @dma_event_q: DMA event queue to use if SPI_IO_TYPE_DMA is used for any + * device on the bus. 
+ */ +struct davinci_spi_platform_data { + u8 version; + u8 num_chipselect; + u8 intr_line; + u8 prescaler_limit; + bool cshold_bug; + enum dma_event_q dma_event_q; +}; + +/** + * struct davinci_spi_config - Per-chip-select configuration for SPI slave devices + * + * @wdelay: amount of delay between transmissions. Measured in number of + * SPI module clocks. + * @odd_parity: polarity of parity flag at the end of transmit data stream. + * 0 - odd parity, 1 - even parity. + * @parity_enable: enable transmission of parity at end of each transmit + * data stream. + * @io_type: type of IO transfer. Choose between polled, interrupt and DMA. + * @timer_disable: disable chip-select timers (setup and hold) + * @c2tdelay: chip-select setup time. Measured in number of SPI module clocks. + * @t2cdelay: chip-select hold time. Measured in number of SPI module clocks. + * @t2edelay: transmit data finished to SPI ENAn pin inactive time. Measured + * in number of SPI clocks. + * @c2edelay: chip-select active to SPI ENAn signal active time. Measured in + * number of SPI clocks. + */ +struct davinci_spi_config { + u8 wdelay; + u8 odd_parity; + u8 parity_enable; + u8 io_type; + u8 timer_disable; + u8 c2tdelay; + u8 t2cdelay; + u8 t2edelay; + u8 c2edelay; +}; + /* SPI Controller driver's private data. */ struct davinci_spi { struct spi_bitbang bitbang; diff --git a/include/linux/platform_data/spi-davinci.h b/include/linux/platform_data/spi-davinci.h deleted file mode 100644 index 2cb5cc70fd9d..000000000000 --- a/include/linux/platform_data/spi-davinci.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2009 Texas Instruments. - */ - -#ifndef __ARCH_ARM_DAVINCI_SPI_H -#define __ARCH_ARM_DAVINCI_SPI_H - -#include - -#define SPI_INTERN_CS 0xFF - -enum { - SPI_VERSION_1, /* For DM355/DM365/DM6467 */ - SPI_VERSION_2, /* For DA8xx */ -}; - -/** - * davinci_spi_platform_data - Platform data for SPI master device on DaVinci - * - * @version: version of the SPI IP. Different DaVinci devices have slightly - * varying versions of the same IP. - * @num_chipselect: number of chipselects supported by this SPI master - * @intr_line: interrupt line used to connect the SPI IP to the ARM interrupt - * controller withn the SoC. Possible values are 0 and 1. - * @cshold_bug: set this to true if the SPI controller on your chip requires - * a write to CSHOLD bit in between transfers (like in DM355). - * @dma_event_q: DMA event queue to use if SPI_IO_TYPE_DMA is used for any - * device on the bus. - */ -struct davinci_spi_platform_data { - u8 version; - u8 num_chipselect; - u8 intr_line; - u8 prescaler_limit; - bool cshold_bug; - enum dma_event_q dma_event_q; -}; - -/** - * davinci_spi_config - Per-chip-select configuration for SPI slave devices - * - * @wdelay: amount of delay between transmissions. Measured in number of - * SPI module clocks. - * @odd_parity: polarity of parity flag at the end of transmit data stream. - * 0 - odd parity, 1 - even parity. - * @parity_enable: enable transmission of parity at end of each transmit - * data stream. - * @io_type: type of IO transfer. Choose between polled, interrupt and DMA. - * @timer_disable: disable chip-select timers (setup and hold) - * @c2tdelay: chip-select setup time. Measured in number of SPI module clocks. - * @t2cdelay: chip-select hold time. Measured in number of SPI module clocks. - * @t2edelay: transmit data finished to SPI ENAn pin inactive time. Measured - * in number of SPI clocks. 
- * @c2edelay: chip-select active to SPI ENAn signal active time. Measured in - * number of SPI clocks. - */ -struct davinci_spi_config { - u8 wdelay; - u8 odd_parity; - u8 parity_enable; -#define SPI_IO_TYPE_INTR 0 -#define SPI_IO_TYPE_POLL 1 -#define SPI_IO_TYPE_DMA 2 - u8 io_type; - u8 timer_disable; - u8 c2tdelay; - u8 t2cdelay; - u8 t2edelay; - u8 c2edelay; -}; - -#endif /* __ARCH_ARM_DAVINCI_SPI_H */ -- cgit v1.2.3 From f49ae86483c494ddc793d889f6df5ea68d138569 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:54 +0000 Subject: memregion: Drop unused IORES_DESC_* parameter from cpu_cache_invalidate_memregion() The res_desc parameter was originally introduced for documentation purposes and with the idea that with HDM-DB CXL invalidation could be triggered from the device. That has not come to pass and the continued existence of the option is confusing when we add a range in the following patch which might not be a strict subset of the res_desc. So avoid that confusion by dropping the parameter. Link: https://lore.kernel.org/linux-mm/686eedb25ed02_24471002e@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Dan Williams Suggested-by: Dan Williams Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 2 +- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 7 +++---- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 8834c76f91c9..4019b17fb65e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(int res_desc) +int cpu_cache_invalidate_memregion(void) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 71cc42d05248..d7fa76810f82 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -228,7 +228,7 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(IORES_DESC_CXL); + cpu_cache_invalidate_memregion(); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 88dc062af5f8..c43506448edf 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. 
*/ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index de1ee5ebc851..3cdd93d40997 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c01321467789..945646bde825 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -26,8 +26,7 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for - * memregions described by @res_desc - * @res_desc: one of the IORES_DESC_* types + * memregion * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -46,7 +45,7 @@ static inline void memregion_free(int id) * the cache maintenance. */ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(int res_desc); +int cpu_cache_invalidate_memregion(void); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -54,7 +53,7 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(int res_desc) +static inline int cpu_cache_invalidate_memregion(void) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; -- cgit v1.2.3 From b43652d867cf2a5f31b14e3d9a320ad01fca0992 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:55 +0000 Subject: memregion: Support fine grained invalidate by cpu_cache_invalidate_memregion() Extend cpu_cache_invalidate_memregion() to support invalidating a particular range of memory by introducing start and length parameters. Control of types of invalidation is left for when use cases turn up. For now everything is Clean and Invalidate. Where the range is unknown, use the provided cpu_cache_invalidate_all() helper to act as documentation of intent in a fashion that is clearer than passing (0, -1) to cpu_cache_invalidate_memregion(). 
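For illustration, a minimal caller sketch of the reworked interface (a hypothetical teardown helper, not part of this patch; it assumes the caller holds a struct resource *res describing the region, mirroring the CXL and nvdimm conversions below):

#include <linux/ioport.h>
#include <linux/memregion.h>

/* Hypothetical helper: flush CPU caches for a region being torn down. */
static int example_region_cache_flush(struct resource *res)
{
	if (!cpu_cache_has_invalidate_memregion())
		return -ENXIO;

	if (res)	/* range known: invalidate only this physical window */
		return cpu_cache_invalidate_memregion(res->start,
						      resource_size(res));

	/* range unknown: states the intent better than passing (0, -1) */
	return cpu_cache_invalidate_all();
}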
Signed-off-by: Yicong Yang Reviewed-by: Dan Williams Acked-by: Davidlohr Bueso Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 5 ++++- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 13 +++++++++++-- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 4019b17fb65e..292c7202faed 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(void) +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d7fa76810f82..410e41cef5d3 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -228,7 +228,10 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(); + if (!cxlr->params.res) + return -ENXIO; + cpu_cache_invalidate_memregion(cxlr->params.res->start, + resource_size(cxlr->params.res)); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index c43506448edf..42e982db5b04 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. */ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3cdd93d40997..e27fc380f6c0 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 945646bde825..a55f62cc5266 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -27,6 +27,9 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for * memregion + * @start: start physical address of the target memory region. + * @len: length of the target memory region. -1 for all the regions of + * the target type. * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -45,7 +48,7 @@ static inline void memregion_free(int id) * the cache maintenance. 
*/ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(void); +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -53,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(void) +static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; } #endif + +static inline int cpu_cache_invalidate_all(void) +{ + return cpu_cache_invalidate_memregion(0, -1); +} + #endif /* _MEMREGION_H_ */ -- cgit v1.2.3 From 24afd7827efb7c69adfc41835390470e3eec4740 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Fri, 14 Nov 2025 08:38:04 +0800 Subject: net: phy: Add helper for fixing RGMII PHY mode based on internal mac delay The "phy-mode" property of devicetree indicates whether the PCB has delay now, which means the mac needs to modify the PHY mode based on whether there is an internal delay in the mac. This modification is similar for many ethernet drivers. To simplify code, define the helper phy_fix_phy_mode_for_mac_delays(speed, mac_txid, mac_rxid) to fix PHY mode based on whether mac adds internal delay. Suggested-by: Russell King (Oracle) Signed-off-by: Inochi Amaoto Reviewed-by: Maxime Chevallier Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251114003805.494387-3-inochiama@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-core.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 605ca20ae192..0c63e6ba2cb0 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -101,6 +101,49 @@ const char *phy_rate_matching_to_str(int rate_matching) } EXPORT_SYMBOL_GPL(phy_rate_matching_to_str); +/** + * phy_fix_phy_mode_for_mac_delays - Convenience function for fixing PHY + * mode based on whether mac adds internal delay + * + * @interface: The current interface mode of the port + * @mac_txid: True if the mac adds internal tx delay + * @mac_rxid: True if the mac adds internal rx delay + * + * Return: fixed PHY mode, or PHY_INTERFACE_MODE_NA if the interface can + * not apply the internal delay + */ +phy_interface_t phy_fix_phy_mode_for_mac_delays(phy_interface_t interface, + bool mac_txid, bool mac_rxid) +{ + if (!phy_interface_mode_is_rgmii(interface)) + return interface; + + if (mac_txid && mac_rxid) { + if (interface == PHY_INTERFACE_MODE_RGMII_ID) + return PHY_INTERFACE_MODE_RGMII; + return PHY_INTERFACE_MODE_NA; + } + + if (mac_txid) { + if (interface == PHY_INTERFACE_MODE_RGMII_ID) + return PHY_INTERFACE_MODE_RGMII_RXID; + if (interface == PHY_INTERFACE_MODE_RGMII_TXID) + return PHY_INTERFACE_MODE_RGMII; + return PHY_INTERFACE_MODE_NA; + } + + if (mac_rxid) { + if (interface == PHY_INTERFACE_MODE_RGMII_ID) + return PHY_INTERFACE_MODE_RGMII_TXID; + if (interface == PHY_INTERFACE_MODE_RGMII_RXID) + return PHY_INTERFACE_MODE_RGMII; + return PHY_INTERFACE_MODE_NA; + } + + return interface; +} +EXPORT_SYMBOL_GPL(phy_fix_phy_mode_for_mac_delays); + /** * phy_interface_num_ports - Return the number of links that can be carried by * a given MAC-PHY physical link. 
Returns 0 if this is diff --git a/include/linux/phy.h b/include/linux/phy.h index bf5457341ca8..65b0c3ca6a2b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) return phydev->is_pseudo_fixed_link; } +phy_interface_t phy_fix_phy_mode_for_mac_delays(phy_interface_t interface, + bool mac_txid, bool mac_rxid); + int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); -- cgit v1.2.3 From fc45aee66223253ec5547094d7552819914abdfb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Mar 2025 00:06:29 -0400 Subject: get rid of kill_litter_super() Not used anymore. Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 7 +++++++ fs/dcache.c | 21 --------------------- fs/internal.h | 1 - fs/super.c | 8 -------- include/linux/dcache.h | 1 - include/linux/fs.h | 1 - 6 files changed, 7 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..4921b3b0662a 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1309,3 +1309,10 @@ a different length, use vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) instead. + +--- + +**mandatory** + +kill_litter_super() is gone; convert to DCACHE_PERSISTENT use (as all +in-tree filesystems have done). diff --git a/fs/dcache.c b/fs/dcache.c index 3cc6c3876177..5ee2e78a91b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3167,27 +3167,6 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) } EXPORT_SYMBOL(is_subdir); -static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) -{ - struct dentry *root = data; - if (dentry != root) { - if (d_unhashed(dentry) || !dentry->d_inode || - dentry->d_flags & DCACHE_PERSISTENT) - return D_WALK_SKIP; - - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } - } - return D_WALK_CONTINUE; -} - -void d_genocide(struct dentry *parent) -{ - d_walk(parent, parent, d_genocide_kill); -} - void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; diff --git a/fs/internal.h b/fs/internal.h index 9b2b4d116880..144686af6c36 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -227,7 +227,6 @@ extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); -extern void d_genocide(struct dentry *); /* * pipe.c diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..ee001f684d2a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1284,14 +1284,6 @@ void kill_anon_super(struct super_block *sb) } EXPORT_SYMBOL(kill_anon_super); -void kill_litter_super(struct super_block *sb) -{ - if (sb->s_root) - d_genocide(sb->s_root); - kill_anon_super(sb); -} -EXPORT_SYMBOL(kill_litter_super); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6ec4066825e3..20a85144a00e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -198,7 +198,6 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. 
*/ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), - DCACHE_GENOCIDE = BIT(9), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* diff --git a/include/linux/fs.h b/include/linux/fs.h index f5037c556f61..95933ceaae51 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2728,7 +2728,6 @@ void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); -- cgit v1.2.3 From ca459ca70f60ce05445845eca74c788b0d5ddb1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Oct 2025 18:34:49 -0400 Subject: kill securityfs_recursive_remove() it's an unused alias for securityfs_remove() Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/security.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..9e710cfee744 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2258,8 +2258,6 @@ static inline void securityfs_remove(struct dentry *dentry) #endif -#define securityfs_recursive_remove securityfs_remove - #ifdef CONFIG_BPF_SYSCALL union bpf_attr; struct bpf_map; -- cgit v1.2.3 From eb028c33451af08bb34f45c6be6967ef1c98cbd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Oct 2025 18:32:21 -0400 Subject: d_make_discardable(): warn if given a non-persistent dentry At this point there are very few call chains that might lead to d_make_discardable() on a dentry that hadn't been made persistent: calls of simple_unlink() and simple_rmdir() in configfs and apparmorfs. Both filesystems do pin (part of) their contents in dcache, but they are currently playing very unusual games with that. Converting them to more usual patterns might be possible, but it's definitely going to be a long series of changes in both cases. For now the easiest solution is to have both stop using simple_unlink() and simple_rmdir() - that allows to make d_make_discardable() warn when given a non-persistent dentry. Rather than giving them full-blown private copies (with calls of d_make_discardable() replaced with dput()), let's pull the parts of simple_unlink() and simple_rmdir() that deal with timestamps and link counts into separate helpers (__simple_unlink() and __simple_rmdir() resp.) and have those used by configfs and apparmorfs. 
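For illustration, a minimal sketch of the replacement pattern (a hypothetical removal path, not taken from this patch; the real conversions are in the configfs and apparmorfs hunks below). It assumes the pseudo-filesystem itself holds the extra dentry reference that previously went away via d_make_discardable():

#include <linux/dcache.h>
#include <linux/fs.h>

/* Hypothetical removal path for a dcache-pinned pseudo-fs object. */
static void example_pinned_remove(struct inode *dir, struct dentry *dentry)
{
	if (!simple_positive(dentry))
		return;

	if (d_is_dir(dentry)) {
		if (WARN_ON(!simple_empty(dentry)))
			return;
		__simple_rmdir(dir, dentry);	/* nlink and timestamp bookkeeping only */
	} else {
		__simple_unlink(dir, dentry);
	}
	dput(dentry);	/* the filesystem drops its own reference itself */
}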
Signed-off-by: Al Viro --- fs/configfs/dir.c | 10 ++++++++-- fs/configfs/inode.c | 3 ++- fs/dcache.c | 9 +-------- fs/libfs.c | 21 +++++++++++++++++---- include/linux/fs.h | 2 ++ security/apparmor/apparmorfs.c | 13 +++++++++---- 6 files changed, 39 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 81f4f06bc87e..e8f2f44012e9 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -400,8 +400,14 @@ static void remove_dir(struct dentry * d) configfs_remove_dirent(d); - if (d_really_is_positive(d)) - simple_rmdir(d_inode(parent),d); + if (d_really_is_positive(d)) { + if (likely(simple_empty(d))) { + __simple_rmdir(d_inode(parent),d); + dput(d); + } else { + pr_warn("remove_dir (%pd): attributes remain", d); + } + } pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1d2e3a5738d1..bcda3372e141 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -211,7 +211,8 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(d_inode(parent), dentry); + __simple_unlink(d_inode(parent), dentry); + dput(dentry); } else spin_unlock(&dentry->d_lock); } diff --git a/fs/dcache.c b/fs/dcache.c index 5ee2e78a91b3..824d620bb563 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -931,14 +931,7 @@ EXPORT_SYMBOL(dput); void d_make_discardable(struct dentry *dentry) { spin_lock(&dentry->d_lock); - /* - * By the end of the series we'll add - * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT); - * here, but while object removal is done by a few common helpers, - * object creation tends to be open-coded (if nothing else, new inode - * needs to be set up), so adding a warning from the very beginning - * would make for much messier patch series. 
- */ + WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; rcu_read_lock(); diff --git a/fs/libfs.c b/fs/libfs.c index 80f288a771e3..0aa630e7eb00 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -790,13 +790,27 @@ out: } EXPORT_SYMBOL(simple_empty); -int simple_unlink(struct inode *dir, struct dentry *dentry) +void __simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); +} +EXPORT_SYMBOL(__simple_unlink); + +void __simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + drop_nlink(d_inode(dentry)); + __simple_unlink(dir, dentry); + drop_nlink(dir); +} +EXPORT_SYMBOL(__simple_rmdir); + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + __simple_unlink(dir, dentry); d_make_discardable(dentry); return 0; } @@ -807,9 +821,8 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(d_inode(dentry)); - simple_unlink(dir, dentry); - drop_nlink(dir); + __simple_rmdir(dir, dentry); + d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_rmdir); diff --git a/include/linux/fs.h b/include/linux/fs.h index 95933ceaae51..ef842adbd418 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3621,6 +3621,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern void __simple_unlink(struct inode *, struct dentry *); +extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 391a586d0557..9b9090d38ea2 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -358,10 +358,15 @@ static void aafs_remove(struct dentry *dentry) dir = d_inode(dentry->d_parent); inode_lock(dir); if (simple_positive(dentry)) { - if (d_is_dir(dentry)) - simple_rmdir(dir, dentry); - else - simple_unlink(dir, dentry); + if (d_is_dir(dentry)) { + if (!WARN_ON(!simple_empty(dentry))) { + __simple_rmdir(dir, dentry); + dput(dentry); + } + } else { + __simple_unlink(dir, dentry); + dput(dentry); + } d_delete(dentry); dput(dentry); } -- cgit v1.2.3 From 7dc211c1159d991db609bdf4b0fb9033c04adcbc Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 15 Nov 2025 10:23:43 +0000 Subject: bpf: Fix invalid prog->stats access when update_effective_progs fails Syzkaller triggers an invalid memory access issue following fault injection in update_effective_progs. The issue can be described as follows: __cgroup_bpf_detach update_effective_progs compute_effective_progs bpf_prog_array_alloc <-- fault inject purge_effective_progs /* change to dummy_bpf_prog */ array->items[index] = &dummy_bpf_prog.prog ---softirq start--- __do_softirq ... 
__cgroup_bpf_run_filter_skb __bpf_prog_run_save_cb bpf_prog_run stats = this_cpu_ptr(prog->stats) /* invalid memory access */ flags = u64_stats_update_begin_irqsave(&stats->syncp) ---softirq end--- static_branch_dec(&cgroup_bpf_enabled_key[atype]) The reason is that fault injection caused update_effective_progs to fail and then changed the original prog into dummy_bpf_prog.prog in purge_effective_progs. Then a softirq came, and accessing the members of dummy_bpf_prog.prog in the softirq triggers invalid mem access. To fix it, skip updating stats when stats is NULL. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20251115102343.2200727-1-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 12 +++++++----- kernel/bpf/syscall.c | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 973233b82dc1..569de3b14279 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); + if (likely(prog->stats)) { + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); + } } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a2a441185f81..792623a7c90b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2463,6 +2463,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) struct bpf_prog_stats *stats; unsigned int flags; + if (unlikely(!prog->stats)) + return; + stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); -- cgit v1.2.3 From 9c7dacf5d51910f34a3bd709403f6a82ffc8c960 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:14 +0100 Subject: platform/x86: asus-armoury: add apu-mem control support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the APU memory size control under the asus-armoury module using the fw_attributes class. This allows the APU allocated memory size to be adjusted depending on the users priority. A reboot is required after change. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-5-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 98 ++++++++++++++++++++++++++++++ include/linux/platform_data/x86/asus-wmi.h | 2 + 2 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index f0cb973a487e..1b972260c5dd 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -174,6 +174,7 @@ static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 de * and should perform relevant checks. 
* * Returns: + * * %-EINVAL - attempt to set a dangerous or unsupported value. * * %-EIO - WMI function returned an error. * * %0 - successful and retval is filled. * * %other - error from WMI call. @@ -184,6 +185,26 @@ static int armoury_set_devstate(struct kobj_attribute *attr, u32 result; int err; + /* + * Prevent developers from bricking devices or issuing dangerous + * commands that can be difficult or impossible to recover from. + */ + switch (dev_id) { + case ASUS_WMI_DEVID_APU_MEM: + /* + * A hard reset might suffice to save the device, + * but there is no value in sending these commands. + */ + if (value == 0x100 || value == 0x101) { + pr_err("Refusing to set APU memory to unsafe value: 0x%x\n", value); + return -EINVAL; + } + break; + default: + /* No problems are known for this dev_id */ + break; + } + err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result); if (err) { if (attr) @@ -599,6 +620,82 @@ static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kob } ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); +/* Device memory available to APU */ + +/* + * Values map for APU reserved memory (index + 1 number of GB). + * Some looks out of order, but are actually correct. + */ +static u32 apu_mem_map[] = { + [0] = 0x000, /* called "AUTO" on the BIOS, is the minimum available */ + [1] = 0x102, + [2] = 0x103, + [3] = 0x104, + [4] = 0x105, + [5] = 0x107, + [6] = 0x108, + [7] = 0x109, + [8] = 0x106, +}; + +static ssize_t apu_mem_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int err; + u32 mem; + + err = armoury_get_devstate(attr, &mem, ASUS_WMI_DEVID_APU_MEM); + if (err) + return err; + + /* After 0x000 is set, a read will return 0x100 */ + if (mem == 0x100) + return sysfs_emit(buf, "0\n"); + + for (unsigned int i = 0; i < ARRAY_SIZE(apu_mem_map); i++) { + if (apu_mem_map[i] == mem) + return sysfs_emit(buf, "%u\n", i); + } + + pr_warn("Unrecognised value for APU mem 0x%08x\n", mem); + return -EIO; +} + +static ssize_t apu_mem_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int result, err; + u32 requested, mem; + + result = kstrtou32(buf, 10, &requested); + if (result) + return result; + + if (requested >= ARRAY_SIZE(apu_mem_map)) + return -EINVAL; + mem = apu_mem_map[requested]; + + err = armoury_set_devstate(attr, mem, NULL, ASUS_WMI_DEVID_APU_MEM); + if (err) { + pr_warn("Failed to set apu_mem 0x%x: %d\n", mem, err); + return err; + } + + pr_info("APU memory changed to %uGB, reboot required\n", requested + 1); + sysfs_notify(kobj, NULL, attr->attr.name); + + asus_set_reboot_and_signal_event(); + + return count; +} + +static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return armoury_attr_enum_list(buf, ARRAY_SIZE(apu_mem_map)); +} +ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); + /* Simple attribute creation */ ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", "Show the current mode of charging"); @@ -618,6 +715,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, + { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, { 
&boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3cc235b20be4..9a6433d08973 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -136,6 +136,8 @@ /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 +#define ASUS_WMI_DEVID_APU_MEM 0x000600C1 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 7725a2dc58632cb44eeef2e5b959ab7b7931298d Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:16 +0100 Subject: platform/x86: asus-armoury: add screen auto-brightness toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add screen_auto_brightness toggle supported on some laptops. Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-7-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 4 ++++ include/linux/platform_data/x86/asus-wmi.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index 1b972260c5dd..c1dbaed409d2 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -707,6 +707,9 @@ ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, "Set the panel refresh overdrive"); ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, "Set the panel HD mode to UHD<0> or FHD<1>"); +ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", + ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS, + "Set the panel brightness to Off<0> or On<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); @@ -722,6 +725,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, + { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, }; static int asus_fw_attr_add(void) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 9a6433d08973..3af075baf9f7 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -82,6 +82,7 @@ #define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 #define ASUS_WMI_DEVID_MINI_LED_MODE 0x0005001E #define ASUS_WMI_DEVID_MINI_LED_MODE2 0x0005002E +#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS 0x0005002A /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit v1.2.3 From d849a9f2380d5287d5133eac5bae602a147b86c2 Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Sun, 2 Nov 2025 22:53:18 +0100 Subject: platform/x86: asus-wmi: rename ASUS_WMI_DEVID_PPT_FPPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain power-related WMI macros naming consistency: rename ASUS_WMI_DEVID_PPT_FPPT to ASUS_WMI_DEVID_PPT_PL3_FPPT. 
Link: https://lore.kernel.org/all/cad7b458-5a7a-4975-94a1-d0c74f6f3de5@oracle.com/ Suggested-by: ALOK TIWARI Signed-off-by: Denis Benato Link: https://.../ Link: https://patch.msgid.link/20251102215319.3126879-9-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 4 ++-- include/linux/platform_data/x86/asus-wmi.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 6de633d4a748..64cfc0bf98dd 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -1218,7 +1218,7 @@ static ssize_t ppt_fppt_store(struct device *dev, if (value < PPT_TOTAL_MIN || value > PPT_TOTAL_MAX) return -EINVAL; - err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_FPPT, value, &result); + err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_PL3_FPPT, value, &result); if (err) { pr_warn("Failed to set ppt_fppt: %d\n", err); return err; @@ -4602,7 +4602,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, else if (attr == &dev_attr_ppt_pl1_spl.attr) devid = ASUS_WMI_DEVID_PPT_PL1_SPL; else if (attr == &dev_attr_ppt_fppt.attr) - devid = ASUS_WMI_DEVID_PPT_FPPT; + devid = ASUS_WMI_DEVID_PPT_PL3_FPPT; else if (attr == &dev_attr_ppt_apu_sppt.attr) devid = ASUS_WMI_DEVID_PPT_APU_SPPT; else if (attr == &dev_attr_ppt_platform_sppt.attr) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3af075baf9f7..e7c95e9d29db 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -107,7 +107,7 @@ #define ASUS_WMI_DEVID_PPT_PL1_SPL 0x001200A3 #define ASUS_WMI_DEVID_PPT_APU_SPPT 0x001200B0 #define ASUS_WMI_DEVID_PPT_PLAT_SPPT 0x001200B1 -#define ASUS_WMI_DEVID_PPT_FPPT 0x001200C1 +#define ASUS_WMI_DEVID_PPT_PL3_FPPT 0x001200C1 #define ASUS_WMI_DEVID_NV_DYN_BOOST 0x001200C0 #define ASUS_WMI_DEVID_NV_THERM_TARGET 0x001200C2 -- cgit v1.2.3 From 39ae6c50e599aa0cf62ea3d0dcf06492f7690ed7 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:19 +0100 Subject: platform/x86: asus-armoury: add ppt_* and nv_* tuning knobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the ppt_* and nv_* tuning knobs that are available via WMI methods and adds proper min/max levels plus defaults. The min/max are defined by ASUS and typically gained by looking at what they allow in the ASUS Armoury Crate application - ASUS does not share the values outside of this. It could also be possible to gain the AMD values by use of ryzenadj and testing for the minimum stable value. The general rule of thumb for adding to the match table is that if the model range has a single CPU used throughout, then the DMI match can omit the last letter of the model number as this is the GPU model. If a min or max value is not provided it is assumed that the particular setting is not supported. for example ppt_pl2_sppt_min/max is not set. If a _def is not set then the default is assumed to be _max It is assumed that at least AC settings are available so that the firmware attributes will be created - if no DC table is available and power is on DC, then reading the attributes is -ENODEV. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Tested-by: Mateusz Schyboll Tested-by: Porfet Lillian Link: https://patch.msgid.link/20251102215319.3126879-10-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 302 ++++++- drivers/platform/x86/asus-armoury.h | 1294 ++++++++++++++++++++++++++++ include/linux/platform_data/x86/asus-wmi.h | 3 + 3 files changed, 1593 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index c1dbaed409d2..d6aba68515e2 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "asus-armoury.h" @@ -48,6 +49,33 @@ #define ASUS_MINI_LED_2024_STRONG 0x01 #define ASUS_MINI_LED_2024_OFF 0x02 +/* Power tunable attribute name defines */ +#define ATTR_PPT_PL1_SPL "ppt_pl1_spl" +#define ATTR_PPT_PL2_SPPT "ppt_pl2_sppt" +#define ATTR_PPT_PL3_FPPT "ppt_pl3_fppt" +#define ATTR_PPT_APU_SPPT "ppt_apu_sppt" +#define ATTR_PPT_PLATFORM_SPPT "ppt_platform_sppt" +#define ATTR_NV_DYNAMIC_BOOST "nv_dynamic_boost" +#define ATTR_NV_TEMP_TARGET "nv_temp_target" +#define ATTR_NV_BASE_TGP "nv_base_tgp" +#define ATTR_NV_TGP "nv_tgp" + +#define ASUS_ROG_TUNABLE_DC 0 +#define ASUS_ROG_TUNABLE_AC 1 + +struct rog_tunables { + const struct power_limits *power_limits; + u32 ppt_pl1_spl; // cpu + u32 ppt_pl2_sppt; // cpu + u32 ppt_pl3_fppt; // cpu + u32 ppt_apu_sppt; // plat + u32 ppt_platform_sppt; // plat + + u32 nv_dynamic_boost; + u32 nv_temp_target; + u32 nv_tgp; +}; + struct asus_armoury_priv { struct device *fw_attr_dev; struct kset *fw_attr_kset; @@ -60,6 +88,9 @@ struct asus_armoury_priv { */ struct mutex egpu_mutex; + /* Index 0 for DC, 1 for AC */ + struct rog_tunables *rog_tunables[2]; + u32 mini_led_dev_id; u32 gpu_mux_dev_id; }; @@ -290,6 +321,12 @@ static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "enumeration\n"); } +static ssize_t int_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "integer\n"); +} + /* Mini-LED mode **************************************************************/ /* Values map for mini-led modes on 2023 and earlier models. 
*/ @@ -696,6 +733,15 @@ static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_at } ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); +/* Define helper to access the current power mode tunable values */ +static inline struct rog_tunables *get_current_tunables(void) +{ + if (power_supply_is_system_supplied()) + return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]; + + return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]; +} + /* Simple attribute creation */ ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", "Show the current mode of charging"); @@ -712,6 +758,24 @@ ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", "Set the panel brightness to Off<0> or On<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl1_spl, ATTR_PPT_PL1_SPL, ASUS_WMI_DEVID_PPT_PL1_SPL, + "Set the CPU slow package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl2_sppt, ATTR_PPT_PL2_SPPT, ASUS_WMI_DEVID_PPT_PL2_SPPT, + "Set the CPU fast package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl3_fppt, ATTR_PPT_PL3_FPPT, ASUS_WMI_DEVID_PPT_PL3_FPPT, + "Set the CPU fastest package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_apu_sppt, ATTR_PPT_APU_SPPT, ASUS_WMI_DEVID_PPT_APU_SPPT, + "Set the APU package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_platform_sppt, ATTR_PPT_PLATFORM_SPPT, ASUS_WMI_DEVID_PPT_PLAT_SPPT, + "Set the platform package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_dynamic_boost, ATTR_NV_DYNAMIC_BOOST, ASUS_WMI_DEVID_NV_DYN_BOOST, + "Set the Nvidia dynamic boost limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_temp_target, ATTR_NV_TEMP_TARGET, ASUS_WMI_DEVID_NV_THERM_TARGET, + "Set the Nvidia max thermal limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_tgp, "nv_tgp", ASUS_WMI_DEVID_DGPU_SET_TGP, + "Set the additional TGP on top of the base TGP"); +ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(nv_base_tgp, ATTR_NV_BASE_TGP, ASUS_WMI_DEVID_DGPU_BASE_TGP, + "Read the base TGP value"); /* If an attribute does not require any special case handling add it here */ static const struct asus_attr_group armoury_attr_groups[] = { @@ -720,6 +784,16 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, + { &ppt_pl1_spl_attr_group, ASUS_WMI_DEVID_PPT_PL1_SPL }, + { &ppt_pl2_sppt_attr_group, ASUS_WMI_DEVID_PPT_PL2_SPPT }, + { &ppt_pl3_fppt_attr_group, ASUS_WMI_DEVID_PPT_PL3_FPPT }, + { &ppt_apu_sppt_attr_group, ASUS_WMI_DEVID_PPT_APU_SPPT }, + { &ppt_platform_sppt_attr_group, ASUS_WMI_DEVID_PPT_PLAT_SPPT }, + { &nv_dynamic_boost_attr_group, ASUS_WMI_DEVID_NV_DYN_BOOST }, + { &nv_temp_target_attr_group, ASUS_WMI_DEVID_NV_THERM_TARGET }, + { &nv_base_tgp_attr_group, ASUS_WMI_DEVID_DGPU_BASE_TGP }, + { &nv_tgp_attr_group, ASUS_WMI_DEVID_DGPU_SET_TGP }, + { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, @@ -728,8 +802,76 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, }; +/** + * is_power_tunable_attr - Determines if an attribute is a power-related tunable + * @name: The name of the attribute to check + * + * This function checks if the given attribute name is related to power tuning. 
+ * + * Return: true if the attribute is a power-related tunable, false otherwise + */ +static bool is_power_tunable_attr(const char *name) +{ + static const char * const power_tunable_attrs[] = { + ATTR_PPT_PL1_SPL, ATTR_PPT_PL2_SPPT, + ATTR_PPT_PL3_FPPT, ATTR_PPT_APU_SPPT, + ATTR_PPT_PLATFORM_SPPT, ATTR_NV_DYNAMIC_BOOST, + ATTR_NV_TEMP_TARGET, ATTR_NV_BASE_TGP, + ATTR_NV_TGP + }; + + for (unsigned int i = 0; i < ARRAY_SIZE(power_tunable_attrs); i++) { + if (!strcmp(name, power_tunable_attrs[i])) + return true; + } + + return false; +} + +/** + * has_valid_limit - Checks if a power-related attribute has a valid limit value + * @name: The name of the attribute to check + * @limits: Pointer to the power_limits structure containing limit values + * + * This function checks if a power-related attribute has a valid limit value. + * It returns false if limits is NULL or if the corresponding limit value is zero. + * + * Return: true if the attribute has a valid limit value, false otherwise + */ +static bool has_valid_limit(const char *name, const struct power_limits *limits) +{ + u32 limit_value = 0; + + if (!limits) + return false; + + if (!strcmp(name, ATTR_PPT_PL1_SPL)) + limit_value = limits->ppt_pl1_spl_max; + else if (!strcmp(name, ATTR_PPT_PL2_SPPT)) + limit_value = limits->ppt_pl2_sppt_max; + else if (!strcmp(name, ATTR_PPT_PL3_FPPT)) + limit_value = limits->ppt_pl3_fppt_max; + else if (!strcmp(name, ATTR_PPT_APU_SPPT)) + limit_value = limits->ppt_apu_sppt_max; + else if (!strcmp(name, ATTR_PPT_PLATFORM_SPPT)) + limit_value = limits->ppt_platform_sppt_max; + else if (!strcmp(name, ATTR_NV_DYNAMIC_BOOST)) + limit_value = limits->nv_dynamic_boost_max; + else if (!strcmp(name, ATTR_NV_TEMP_TARGET)) + limit_value = limits->nv_temp_target_max; + else if (!strcmp(name, ATTR_NV_BASE_TGP) || + !strcmp(name, ATTR_NV_TGP)) + limit_value = limits->nv_tgp_max; + + return limit_value > 0; +} + static int asus_fw_attr_add(void) { + const struct rog_tunables *const ac_rog_tunables = asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]; + const struct power_limits *limits; + bool should_create; + const char *name; int err, i; asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), @@ -786,12 +928,28 @@ static int asus_fw_attr_add(void) if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) continue; - err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, - armoury_attr_groups[i].attr_group); - if (err) { - pr_err("Failed to create sysfs-group for %s\n", - armoury_attr_groups[i].attr_group->name); - goto err_remove_groups; + /* Always create by default, unless PPT is not present */ + should_create = true; + name = armoury_attr_groups[i].attr_group->name; + + /* Check if this is a power-related tunable requiring limits */ + if (ac_rog_tunables && ac_rog_tunables->power_limits && + is_power_tunable_attr(name)) { + limits = ac_rog_tunables->power_limits; + /* Check only AC: if not present then DC won't be either */ + should_create = has_valid_limit(name, limits); + if (!should_create) + pr_debug("Missing max value for tunable %s\n", name); + } + + if (should_create) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + if (err) { + pr_err("Failed to create sysfs-group for %s\n", + armoury_attr_groups[i].attr_group->name); + goto err_remove_groups; + } } } @@ -820,6 +978,132 @@ fail_class_get: /* Init / exit ****************************************************************/ +/* Set up the min/max and defaults for ROG 
tunables */ +static void init_rog_tunables(void) +{ + const struct power_limits *ac_limits, *dc_limits; + struct rog_tunables *ac_rog_tunables = NULL, *dc_rog_tunables = NULL; + const struct power_data *power_data; + const struct dmi_system_id *dmi_id; + + /* Match the system against the power_limits table */ + dmi_id = dmi_first_match(power_limits); + if (!dmi_id) { + pr_warn("No matching power limits found for this system\n"); + return; + } + + /* Get the power data for this system */ + power_data = dmi_id->driver_data; + if (!power_data) { + pr_info("No power data available for this system\n"); + return; + } + + /* Initialize AC power tunables */ + ac_limits = power_data->ac_data; + if (ac_limits) { + ac_rog_tunables = kzalloc(sizeof(*asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]), + GFP_KERNEL); + if (!ac_rog_tunables) + goto err_nomem; + + asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC] = ac_rog_tunables; + ac_rog_tunables->power_limits = ac_limits; + + /* Set initial AC values */ + ac_rog_tunables->ppt_pl1_spl = + ac_limits->ppt_pl1_spl_def ? + ac_limits->ppt_pl1_spl_def : + ac_limits->ppt_pl1_spl_max; + + ac_rog_tunables->ppt_pl2_sppt = + ac_limits->ppt_pl2_sppt_def ? + ac_limits->ppt_pl2_sppt_def : + ac_limits->ppt_pl2_sppt_max; + + ac_rog_tunables->ppt_pl3_fppt = + ac_limits->ppt_pl3_fppt_def ? + ac_limits->ppt_pl3_fppt_def : + ac_limits->ppt_pl3_fppt_max; + + ac_rog_tunables->ppt_apu_sppt = + ac_limits->ppt_apu_sppt_def ? + ac_limits->ppt_apu_sppt_def : + ac_limits->ppt_apu_sppt_max; + + ac_rog_tunables->ppt_platform_sppt = + ac_limits->ppt_platform_sppt_def ? + ac_limits->ppt_platform_sppt_def : + ac_limits->ppt_platform_sppt_max; + + ac_rog_tunables->nv_dynamic_boost = + ac_limits->nv_dynamic_boost_max; + ac_rog_tunables->nv_temp_target = + ac_limits->nv_temp_target_max; + ac_rog_tunables->nv_tgp = ac_limits->nv_tgp_max; + + pr_debug("AC power limits initialized for %s\n", dmi_id->matches[0].substr); + } else { + pr_debug("No AC PPT limits defined\n"); + } + + /* Initialize DC power tunables */ + dc_limits = power_data->dc_data; + if (dc_limits) { + dc_rog_tunables = kzalloc(sizeof(*asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]), + GFP_KERNEL); + if (!dc_rog_tunables) { + kfree(ac_rog_tunables); + goto err_nomem; + } + + asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC] = dc_rog_tunables; + dc_rog_tunables->power_limits = dc_limits; + + /* Set initial DC values */ + dc_rog_tunables->ppt_pl1_spl = + dc_limits->ppt_pl1_spl_def ? + dc_limits->ppt_pl1_spl_def : + dc_limits->ppt_pl1_spl_max; + + dc_rog_tunables->ppt_pl2_sppt = + dc_limits->ppt_pl2_sppt_def ? + dc_limits->ppt_pl2_sppt_def : + dc_limits->ppt_pl2_sppt_max; + + dc_rog_tunables->ppt_pl3_fppt = + dc_limits->ppt_pl3_fppt_def ? + dc_limits->ppt_pl3_fppt_def : + dc_limits->ppt_pl3_fppt_max; + + dc_rog_tunables->ppt_apu_sppt = + dc_limits->ppt_apu_sppt_def ? + dc_limits->ppt_apu_sppt_def : + dc_limits->ppt_apu_sppt_max; + + dc_rog_tunables->ppt_platform_sppt = + dc_limits->ppt_platform_sppt_def ? 
+ dc_limits->ppt_platform_sppt_def : + dc_limits->ppt_platform_sppt_max; + + dc_rog_tunables->nv_dynamic_boost = + dc_limits->nv_dynamic_boost_max; + dc_rog_tunables->nv_temp_target = + dc_limits->nv_temp_target_max; + dc_rog_tunables->nv_tgp = dc_limits->nv_tgp_max; + + pr_debug("DC power limits initialized for %s\n", dmi_id->matches[0].substr); + } else { + pr_debug("No DC PPT limits defined\n"); + } + + return; + +err_nomem: + pr_err("Failed to allocate memory for tunables\n"); +} + static int __init asus_fw_init(void) { char *wmi_uid; @@ -835,6 +1119,9 @@ static int __init asus_fw_init(void) if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) return -ENODEV; + init_rog_tunables(); + + /* Must always be last step to ensure data is available */ return asus_fw_attr_add(); } @@ -857,6 +1144,9 @@ static void __exit asus_fw_exit(void) sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); kset_unregister(asus_armoury.fw_attr_kset); device_destroy(&firmware_attributes_class, MKDEV(0, 0)); + + kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]); + kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]); } module_init(asus_fw_init); diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h index 3a2a674a1b55..548c66c590a6 100644 --- a/drivers/platform/x86/asus-armoury.h +++ b/drivers/platform/x86/asus-armoury.h @@ -8,6 +8,7 @@ #ifndef _ASUS_ARMOURY_H_ #define _ASUS_ARMOURY_H_ +#include #include #include #include @@ -197,4 +198,1297 @@ ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr .name = _fsname, .attrs = _attrname##_attrs \ } +#define ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(_attrname, _fsname, _wmi, _dispname) \ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RO(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, int_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_type.attr, NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* + * ROG PPT attributes need a little different in setup as they + * require rog_tunables members. + */ + +#define __ROG_TUNABLE_SHOW(_prop, _attrname, _val) \ + static ssize_t _attrname##_##_prop##_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + return sysfs_emit(buf, "%d\n", tunables->power_limits->_val); \ + } \ + static struct kobj_attribute attr_##_attrname##_##_prop = \ + __ASUS_ATTR_RO(_attrname, _prop) + +#define __ROG_TUNABLE_SHOW_DEFAULT(_attrname) \ + static ssize_t _attrname##_default_value_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + return sysfs_emit( \ + buf, "%d\n", \ + tunables->power_limits->_attrname##_def ? 
\ + tunables->power_limits->_attrname##_def : \ + tunables->power_limits->_attrname##_max); \ + } \ + static struct kobj_attribute attr_##_attrname##_default_value = \ + __ASUS_ATTR_RO(_attrname, default_value) + +#define __ROG_TUNABLE_RW(_attr, _wmi) \ + static ssize_t _attr##_current_value_store( \ + struct kobject *kobj, struct kobj_attribute *attr, \ + const char *buf, size_t count) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + if (tunables->power_limits->_attr##_min == \ + tunables->power_limits->_attr##_max) \ + return -EINVAL; \ + \ + return armoury_attr_uint_store(kobj, attr, buf, count, \ + tunables->power_limits->_attr##_min, \ + tunables->power_limits->_attr##_max, \ + &tunables->_attr, _wmi); \ + } \ + static ssize_t _attr##_current_value_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables) \ + return -ENODEV; \ + \ + return sysfs_emit(buf, "%u\n", tunables->_attr); \ + } \ + static struct kobj_attribute attr_##_attr##_current_value = \ + __ASUS_ATTR_RW(_attr, current_value) + +#define ASUS_ATTR_GROUP_ROG_TUNABLE(_attrname, _fsname, _wmi, _dispname) \ + __ROG_TUNABLE_RW(_attrname, _wmi); \ + __ROG_TUNABLE_SHOW_DEFAULT(_attrname); \ + __ROG_TUNABLE_SHOW(min_value, _attrname, _attrname##_min); \ + __ROG_TUNABLE_SHOW(max_value, _attrname, _attrname##_max); \ + __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, int_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_default_value.attr, \ + &attr_##_attrname##_min_value.attr, \ + &attr_##_attrname##_max_value.attr, \ + &attr_##_attrname##_scalar_increment.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* Default is always the maximum value unless *_def is specified */ +struct power_limits { + u8 ppt_pl1_spl_min; + u8 ppt_pl1_spl_def; + u8 ppt_pl1_spl_max; + u8 ppt_pl2_sppt_min; + u8 ppt_pl2_sppt_def; + u8 ppt_pl2_sppt_max; + u8 ppt_pl3_fppt_min; + u8 ppt_pl3_fppt_def; + u8 ppt_pl3_fppt_max; + u8 ppt_apu_sppt_min; + u8 ppt_apu_sppt_def; + u8 ppt_apu_sppt_max; + u8 ppt_platform_sppt_min; + u8 ppt_platform_sppt_def; + u8 ppt_platform_sppt_max; + /* Nvidia GPU specific, default is always max */ + u8 nv_dynamic_boost_def; // unused. exists for macro + u8 nv_dynamic_boost_min; + u8 nv_dynamic_boost_max; + u8 nv_temp_target_def; // unused. exists for macro + u8 nv_temp_target_min; + u8 nv_temp_target_max; + u8 nv_tgp_def; // unused. exists for macro + u8 nv_tgp_min; + u8 nv_tgp_max; +}; + +struct power_data { + const struct power_limits *ac_data; + const struct power_limits *dc_data; + bool requires_fan_curve; +}; + +/* + * For each available attribute there must be a min and a max. + * _def is not required and will be assumed to be default == max if missing. 
+ */ +static const struct dmi_system_id power_limits[] = { + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA401W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 75, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507N"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80 + }, + .dc_data = NULL, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507X"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 105, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 15, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA607P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 30, + .ppt_pl2_sppt_def = 115, + .ppt_pl2_sppt_max = 135, + .ppt_pl3_fppt_min = 30, + 
.ppt_pl3_fppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_def = 60, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 25, + .ppt_pl3_fppt_max = 80, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA608WI"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 90, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 90, + .ppt_pl2_sppt_max = 90, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 90, + .ppt_pl3_fppt_max = 90, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 65, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617NS"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 120, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_max = 35, + .ppt_platform_sppt_min = 45, + .ppt_platform_sppt_max = 100, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617NT"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 45, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 50, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617XS"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 120, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_max = 35, + .ppt_platform_sppt_min = 45, + .ppt_platform_sppt_max = 100, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507VI"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507VV"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + 
.ppt_pl1_spl_def = 115, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 15, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA401Q"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + }, + }, + { + .matches = { + // This model is full AMD. No Nvidia dGPU. + DMI_MATCH(DMI_BOARD_NAME, "GA402R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_def = 30, + .ppt_apu_sppt_max = 45, + .ppt_platform_sppt_min = 40, + .ppt_platform_sppt_max = 60, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA402X"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_def = 65, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 65, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA503R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 65, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 
5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 60, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA605W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU603Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 60, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 40, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + } + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU604V"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 65, + .ppt_pl1_spl_max = 120, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_max = 150, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 40, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605CW"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 45, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 56, + .ppt_pl2_sppt_max = 110, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 80, + .nv_tgp_def = 90, + .nv_tgp_max = 110, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 32, + .ppt_pl2_sppt_max = 110, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605CX"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 45, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 56, + .ppt_pl2_sppt_max = 110, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 7, + .nv_temp_target_max = 87, + .nv_tgp_min = 95, + .nv_tgp_def = 100, + .nv_tgp_max = 110, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 32, + .ppt_pl2_sppt_max = 110, + .nv_temp_target_min = 75, + 
.nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 38, + .ppt_pl2_sppt_max = 53, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV301Q"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV301R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 54, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV601R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 100, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 80, + .ppt_pl3_fppt_max = 125, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 28, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 60, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 80, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV601V"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 110, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 40, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GX650P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 110, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct 
power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 35, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 42, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513I"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + /* Yes this laptop is very limited */ + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513QM"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + /* Yes this laptop is very limited */ + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 100, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 190, + }, + .dc_data = NULL, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 35, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 54, + .ppt_pl2_sppt_max = 100, + .ppt_pl3_fppt_min = 54, + .ppt_pl3_fppt_max = 125, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 50, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 50, + .ppt_pl3_fppt_min = 28, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G614J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G634J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G713PV"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 120, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 65, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 130, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 25, + 
.ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 75, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G733C"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 170, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 35, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G733P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 65, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 130, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 75, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G814J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 140, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G834J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "H7606W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + 
.ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 43, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_max = 53, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 15, + .ppt_pl1_spl_max = 25, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_def = 20, + .ppt_pl2_sppt_max = 30, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_def = 25, + .ppt_pl3_fppt_max = 35, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC72"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 43, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_max = 53, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 17, + .ppt_pl1_spl_max = 25, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_def = 24, + .ppt_pl2_sppt_max = 30, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_def = 30, + .ppt_pl3_fppt_max = 35, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC73XA"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 14, + .ppt_pl2_sppt_max = 45, + .ppt_pl3_fppt_min = 19, + .ppt_pl3_fppt_max = 55, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 17, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 13, + .ppt_pl2_sppt_def = 21, + .ppt_pl2_sppt_max = 45, + .ppt_pl3_fppt_min = 19, + .ppt_pl3_fppt_def = 26, + .ppt_pl3_fppt_max = 55, + }, + }, + }, + {} +}; + #endif /* _ASUS_ARMOURY_H_ */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index e7c95e9d29db..419491d4abca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -139,6 +139,9 @@ #define ASUS_WMI_DEVID_APU_MEM 0x000600C1 +#define ASUS_WMI_DEVID_DGPU_BASE_TGP 0x00120099 +#define ASUS_WMI_DEVID_DGPU_SET_TGP 0x00120098 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 32e3fee88a4ac183541b478f5bc94084ea76436c Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 11 Nov 2025 14:11:24 +0100 Subject: platform/x86: wmi: Remove extern keyword from prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The external function definitions do not need the "extern" keyword. Remove it to silence the associated checkpatch warnings. 
Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251111131125.3379-4-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 10751c8e5e6a..665ea7dc8a92 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -36,13 +36,10 @@ struct wmi_device { */ #define to_wmi_device(device) container_of_const(device, struct wmi_device, dev) -extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, - u8 instance, u32 method_id, - const struct acpi_buffer *in, - struct acpi_buffer *out); +acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, + const struct acpi_buffer *in, struct acpi_buffer *out); -extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, - u8 instance); +union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); @@ -81,9 +78,9 @@ struct wmi_driver { */ #define to_wmi_driver(drv) container_of_const(drv, struct wmi_driver, driver) -extern int __must_check __wmi_driver_register(struct wmi_driver *driver, - struct module *owner); -extern void wmi_driver_unregister(struct wmi_driver *driver); +int __must_check __wmi_driver_register(struct wmi_driver *driver, struct module *owner); + +void wmi_driver_unregister(struct wmi_driver *driver); /** * wmi_driver_register() - Helper macro to register a WMI driver -- cgit v1.2.3 From 80adaccf0e1c8c8fff44be2d959f6dba80af0491 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Nov 2025 13:52:13 +0300 Subject: rseq: Delete duplicate if statement in rseq_virt_userspace_exit() This if statement is indented weirdly. It's a duplicate and doesn't affect runtime (beyond wasting a little time). Delete it. Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/aRxP3YcwscrP1BU_@stanley.mountain --- include/linux/rseq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b5e4803c4ebe..bf8a6bf315f3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -126,7 +126,6 @@ static inline void rseq_force_update(void) */ static inline void rseq_virt_userspace_exit(void) { - if (current->rseq.event.sched_switch) /* * The generic optimization for deferring RSEQ updates until the next * exit relies on having a dedicated TIF_RSEQ. -- cgit v1.2.3 From 5bebe8de19264946d398ead4e6c20c229454a552 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Nov 2025 08:21:27 -0800 Subject: mm/huge_memory: Fix initialization of huge zero folio The recent fix to properly initialize the tags of the huge zero folio had an unfortunate not-so-subtle side effect: it caused the actual *contents* of the huge zero folio to not be initialized at all when the hardware didn't support the memory tagging. The reason was the unfortunate semantics of tag_clear_highpage(): on hardware that didn't do the tagging, it would silently just not do anything at all. And since this is done only on arm64 with MTE support, that basically meant most hardware. It wasn't necessarily immediately obvious since the huge zero page isn't necessarily very heavily used - or because it might already be zero because all-zeroes is the most common pattern. 
But it ends up causing random odd user space failures when you do hit it. The unfortunate semantics have been around for a while, but became a real bug only when we started actively using __GFP_ZEROTAGS in the generic get_huge_zero_folio() function - before that, it had only ever been used in code that checked that the hardware supported it. Fix this by simply changing the semantics of tag_clear_highpage() to return whether it actually successfully did something or not. While at it, also make it initialize multiple pages in one go, since that's actually what the only caller wants it to do and it simplifies the whole logic. Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Link: https://lore.kernel.org/all/20251117082023.90176-1-00107082@163.com/ Reviewed-by: David Hildenbrand (Red Hat) Reported-and-tested-by: David Wang <00107082@163.com> Reported-and-tested-by: Carlos Llamas Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/page.h | 4 ++-- arch/arm64/mm/fault.c | 21 +++++++++++---------- include/linux/highmem.h | 6 ++++-- mm/page_alloc.c | 9 ++------- 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 2312e6ee595f..258cca4b4873 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,8 +33,8 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -void tag_clear_highpage(struct page *to); -#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +bool tag_clear_highpages(struct page *to, int numpages); +#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 125dfa6c613b..a193b6a5d1e6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -967,20 +967,21 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -void tag_clear_highpage(struct page *page) +bool tag_clear_highpages(struct page *page, int numpages) { /* * Check if MTE is supported and fall back to clear_highpage(). * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and - * post_alloc_hook() will invoke tag_clear_highpage(). + * post_alloc_hook() will invoke tag_clear_highpages(). 
*/ - if (!system_supports_mte()) { - clear_highpage(page); - return; - } + if (!system_supports_mte()) + return false; - /* Newly allocated page, shouldn't have been tagged yet */ - WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); + /* Newly allocated pages, shouldn't have been tagged yet */ + for (int i = 0; i < numpages; i++, page++) { + WARN_ON_ONCE(!try_page_mte_tagging(page)); + mte_zero_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + return true; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 105cc4c00cc3..abc20f9810fd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page) kunmap_local(kaddr); } -#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -static inline void tag_clear_highpage(struct page *page) +/* Return false to let people know we did not initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages) { + return false; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..ed82ee55e66a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * If memory tags should be zeroed * (which happens only when memory should be initialized as well). */ - if (zero_tags) { - /* Initialize both memory and memory tags. */ - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); + if (zero_tags) + init = !tag_clear_highpages(page, 1 << order); - /* Take note that memory was initialized by the loop above. */ - init = false; - } if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ -- cgit v1.2.3 From 8bffbfdc01dff26f17f8b382266e71d48e63c5e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 22 Oct 2025 15:51:32 +0200 Subject: reset: remove legacy reset lookup code There are no more users of this code. Let's remove the exported symbols and the implementation from reset core. 
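For context, a board file that still depended on this interface would have registered its entries roughly as below; the provider, device and line names are made up for illustration, and no remaining in-tree code does this, which is what allows the removal:

    /* Sketch of the legacy usage; needs <linux/reset-controller.h>. */
    static struct reset_control_lookup board_reset_lookups[] = {
            /*            provider             index  dev_id             con_id */
            RESET_LOOKUP("reset-controller.0", 0,     "example-device.0", "core"),
    };

    static int __init board_reset_init(void)
    {
            reset_controller_add_lookup(board_reset_lookups,
                                        ARRAY_SIZE(board_reset_lookups));
            return 0;
    }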
Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski [p.zabel@pengutronix.de: folded in 8e6ec20e-8965-4b42-99fc-0462269ff2f1@paulmck-laptop] Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 124 +-------------------------------------- include/linux/reset-controller.h | 33 ----------- 2 files changed, 3 insertions(+), 154 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 22f67fc77ae5..20cd50804ba1 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -25,9 +25,6 @@ static DEFINE_MUTEX(reset_list_mutex); static LIST_HEAD(reset_controller_list); -static DEFINE_MUTEX(reset_lookup_mutex); -static LIST_HEAD(reset_lookup_list); - /* Protects reset_gpio_lookup_list */ static DEFINE_MUTEX(reset_gpio_lookup_mutex); static LIST_HEAD(reset_gpio_lookup_list); @@ -190,33 +187,6 @@ int devm_reset_controller_register(struct device *dev, } EXPORT_SYMBOL_GPL(devm_reset_controller_register); -/** - * reset_controller_add_lookup - register a set of lookup entries - * @lookup: array of reset lookup entries - * @num_entries: number of entries in the lookup array - */ -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ - struct reset_control_lookup *entry; - unsigned int i; - - mutex_lock(&reset_lookup_mutex); - for (i = 0; i < num_entries; i++) { - entry = &lookup[i]; - - if (!entry->dev_id || !entry->provider) { - pr_warn("%s(): reset lookup entry badly specified, skipping\n", - __func__); - continue; - } - - list_add_tail(&entry->list, &reset_lookup_list); - } - mutex_unlock(&reset_lookup_mutex); -} -EXPORT_SYMBOL_GPL(reset_controller_add_lookup); - static inline struct reset_control_array * rstc_to_array(struct reset_control *rstc) { return container_of(rstc, struct reset_control_array, base); @@ -1081,75 +1051,12 @@ out_put: } EXPORT_SYMBOL_GPL(__of_reset_control_get); -static struct reset_controller_dev * -__reset_controller_by_name(const char *name) -{ - struct reset_controller_dev *rcdev; - - lockdep_assert_held(&reset_list_mutex); - - list_for_each_entry(rcdev, &reset_controller_list, list) { - if (!rcdev->dev) - continue; - - if (!strcmp(name, dev_name(rcdev->dev))) - return rcdev; - } - - return NULL; -} - -static struct reset_control * -__reset_control_get_from_lookup(struct device *dev, const char *con_id, - enum reset_control_flags flags) -{ - bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; - const struct reset_control_lookup *lookup; - struct reset_controller_dev *rcdev; - const char *dev_id = dev_name(dev); - struct reset_control *rstc = NULL; - - mutex_lock(&reset_lookup_mutex); - - list_for_each_entry(lookup, &reset_lookup_list, list) { - if (strcmp(lookup->dev_id, dev_id)) - continue; - - if ((!con_id && !lookup->con_id) || - ((con_id && lookup->con_id) && - !strcmp(con_id, lookup->con_id))) { - mutex_lock(&reset_list_mutex); - rcdev = __reset_controller_by_name(lookup->provider); - if (!rcdev) { - mutex_unlock(&reset_list_mutex); - mutex_unlock(&reset_lookup_mutex); - /* Reset provider may not be ready yet. */ - return ERR_PTR(-EPROBE_DEFER); - } - - flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL; - - rstc = __reset_control_get_internal(rcdev, - lookup->index, - flags); - mutex_unlock(&reset_list_mutex); - break; - } - } - - mutex_unlock(&reset_lookup_mutex); - - if (!rstc) - return optional ? 
NULL : ERR_PTR(-ENOENT); - - return rstc; -} - struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags) { bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED; bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED; + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; if (WARN_ON(shared && acquired)) return ERR_PTR(-EINVAL); @@ -1157,7 +1064,7 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id, if (dev->of_node) return __of_reset_control_get(dev->of_node, id, index, flags); - return __reset_control_get_from_lookup(dev, id, flags); + return optional ? NULL : ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(__reset_control_get); @@ -1492,31 +1399,6 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) } EXPORT_SYMBOL_GPL(devm_reset_control_array_get); -static int reset_control_get_count_from_lookup(struct device *dev) -{ - const struct reset_control_lookup *lookup; - const char *dev_id; - int count = 0; - - if (!dev) - return -EINVAL; - - dev_id = dev_name(dev); - mutex_lock(&reset_lookup_mutex); - - list_for_each_entry(lookup, &reset_lookup_list, list) { - if (!strcmp(lookup->dev_id, dev_id)) - count++; - } - - mutex_unlock(&reset_lookup_mutex); - - if (count == 0) - count = -ENOENT; - - return count; -} - /** * reset_control_get_count - Count number of resets available with a device * @@ -1530,6 +1412,6 @@ int reset_control_get_count(struct device *dev) if (dev->of_node) return of_reset_control_get_count(dev->of_node); - return reset_control_get_count_from_lookup(dev); + return -ENOENT; } EXPORT_SYMBOL_GPL(reset_control_get_count); diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 357df16ede32..46514cb1b9e0 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -26,31 +26,6 @@ struct module; struct device_node; struct of_phandle_args; -/** - * struct reset_control_lookup - represents a single lookup entry - * - * @list: internal list of all reset lookup entries - * @provider: name of the reset controller device controlling this reset line - * @index: ID of the reset controller in the reset controller device - * @dev_id: name of the device associated with this reset line - * @con_id: name of the reset line (can be NULL) - */ -struct reset_control_lookup { - struct list_head list; - const char *provider; - unsigned int index; - const char *dev_id; - const char *con_id; -}; - -#define RESET_LOOKUP(_provider, _index, _dev_id, _con_id) \ - { \ - .provider = _provider, \ - .index = _index, \ - .dev_id = _dev_id, \ - .con_id = _con_id, \ - } - /** * struct reset_controller_dev - reset controller entity that might * provide multiple reset controls @@ -90,9 +65,6 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev); struct device; int devm_reset_controller_register(struct device *dev, struct reset_controller_dev *rcdev); - -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries); #else static inline int reset_controller_register(struct reset_controller_dev *rcdev) { @@ -108,11 +80,6 @@ static inline int devm_reset_controller_register(struct device *dev, { return 0; } - -static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ -} #endif #endif -- cgit v1.2.3 From f3d8b64ee46c9b4b0b82b1a4642027728bac95b8 Mon Sep 17 00:00:00 2001 From: Encrow Thorne Date: Mon, 10 Nov 2025 14:10:37 +0800 Subject: 
reset: fix BIT macro reference RESET_CONTROL_FLAGS_BIT_* macros use BIT(), but reset.h does not include bits.h. This causes compilation errors when including reset.h standalone. Include bits.h to make reset.h self-contained. Suggested-by: Troy Mitchell Reviewed-by: Troy Mitchell Reviewed-by: Philipp Zabel Signed-off-by: Encrow Thorne Signed-off-by: Philipp Zabel --- include/linux/reset.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 840d75d172f6..44f9e3415f92 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -2,6 +2,7 @@ #ifndef _LINUX_RESET_H_ #define _LINUX_RESET_H_ +#include #include #include #include -- cgit v1.2.3 From 4edf654be5471659e3260be0a557eaa2ece668ab Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Wed, 12 Nov 2025 16:27:06 +0000 Subject: phy: add new phy_notify_state() api Add a new phy_notify_state() api that notifies and configures a phy for a given state transition. This is intended to be used by phy drivers which need to do some runtime configuration of parameters that can't be handled by phy_calibrate() or phy_power_{on|off}(). The first usage of this API is in the Samsung UFS phy that needs to issue some register writes when entering and exiting the hibernate link state. Signed-off-by: Peter Griffin Reviewed-by: Neil Armstrong Link: https://patch.msgid.link/20251112-phy-notify-pmstate-v5-1-39df622d8fcb@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 25 +++++++++++++++++++++++++ include/linux/phy/phy.h | 19 +++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 04a5a34e7a95..60be8af984bf 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -520,6 +520,31 @@ int phy_notify_disconnect(struct phy *phy, int port) } EXPORT_SYMBOL_GPL(phy_notify_disconnect); +/** + * phy_notify_state() - phy state notification + * @phy: the PHY returned by phy_get() + * @state: the PHY state + * + * Notify the PHY of a state transition. Used to notify and + * configure the PHY accordingly. 
+ * + * Returns: %0 if successful, a negative error code otherwise + */ +int phy_notify_state(struct phy *phy, union phy_notify state) +{ + int ret; + + if (!phy || !phy->ops->notify_phystate) + return 0; + + mutex_lock(&phy->mutex); + ret = phy->ops->notify_phystate(phy, state); + mutex_unlock(&phy->mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(phy_notify_state); + /** * phy_configure() - Changes the phy parameters * @phy: the phy returned by phy_get() diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 13add0c2c407..2af0d01ebb39 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -53,6 +53,15 @@ enum phy_media { PHY_MEDIA_DAC, }; +enum phy_ufs_state { + PHY_UFS_HIBERN8_ENTER, + PHY_UFS_HIBERN8_EXIT, +}; + +union phy_notify { + enum phy_ufs_state ufs_state; +}; + /** * union phy_configure_opts - Opaque generic phy configuration * @@ -83,6 +92,7 @@ union phy_configure_opts { * @set_speed: set the speed of the phy (optional) * @reset: resetting the phy * @calibrate: calibrate the phy + * @notify_phystate: notify and configure the phy for a particular state * @release: ops to be performed while the consumer relinquishes the PHY * @owner: the module owner containing the ops */ @@ -132,6 +142,7 @@ struct phy_ops { int (*connect)(struct phy *phy, int port); int (*disconnect)(struct phy *phy, int port); + int (*notify_phystate)(struct phy *phy, union phy_notify state); void (*release)(struct phy *phy); struct module *owner; }; @@ -255,6 +266,7 @@ int phy_reset(struct phy *phy); int phy_calibrate(struct phy *phy); int phy_notify_connect(struct phy *phy, int port); int phy_notify_disconnect(struct phy *phy, int port); +int phy_notify_state(struct phy *phy, union phy_notify state); static inline int phy_get_bus_width(struct phy *phy) { return phy->attrs.bus_width; @@ -412,6 +424,13 @@ static inline int phy_notify_disconnect(struct phy *phy, int index) return -ENOSYS; } +static inline int phy_notify_state(struct phy *phy, union phy_notify state) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_configure(struct phy *phy, union phy_configure_opts *opts) { -- cgit v1.2.3 From 01ba82702957225218c54c06ad2c2d468b83f510 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Chundru Date: Sat, 1 Nov 2025 09:29:33 +0530 Subject: PCI: Add .assert_perst() to control PCIe PERST# Controller driver probes first, enables link training and scans the bus. When the PCI bridge is found, its child DT nodes will be scanned and pwrctrl devices will be created if needed. By the time pwrctrl driver probe gets called, link training is already enabled by controller driver. Certain devices like TC9563, which uses the PCI pwrctl framework, need to configure the device before the PCIe link is up. As the controller driver already enables link training as part of its probe, the moment device is powered on, controller and device participate in link training and link can come up immediately and may not have time to configure the device. So we need to stop the link training by using assert_perst() by asserting PERST# and de-asserting PERST# after device is configured. 
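As a rough sketch of the intended flow (not code from this series), a power control driver holding a pointer to the bridge's struct pci_bus could drive the new callback along these lines, assuming the host controller driver implements it; the helper name is hypothetical and a real consumer may go through a dedicated accessor instead:

    /* Hypothetical wrapper around the new pci_ops callback. */
    static int example_assert_perst(struct pci_bus *bus, bool assert)
    {
            if (!bus->ops->assert_perst)
                    return -EOPNOTSUPP;
            return bus->ops->assert_perst(bus, assert);
    }

    /* Keep the link from training while the device (e.g. TC9563) is configured. */
    example_assert_perst(bus, true);
    /* ... device-specific configuration happens here ... */
    example_assert_perst(bus, false);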
Signed-off-by: Krishna Chaitanya Chundru Signed-off-by: Bjorn Helgaas Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251101-tc9563-v9-2-de3429f7787a@oss.qualcomm.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..ed5dac663e96 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -829,6 +829,7 @@ struct pci_ops { void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); + int (*assert_perst)(struct pci_bus *bus, bool assert); }; /* -- cgit v1.2.3 From d9d0be59be2580f2c5e4b7217aafb980e8c371cf Mon Sep 17 00:00:00 2001 From: Martijn de Gouw Date: Mon, 17 Nov 2025 21:22:14 +0100 Subject: regulator: pca9450: Add support for setting debounce settings Make the different debounce timers configurable from the devicetree. Depending on the board design, these have to be set different than the default register values. Signed-off-by: Martijn de Gouw Link: https://patch.msgid.link/20251117202215.1936139-2-martijn.de.gouw@prodrive-technologies.com Signed-off-by: Mark Brown --- drivers/regulator/pca9450-regulator.c | 158 ++++++++++++++++++++++++++++++---- include/linux/regulator/pca9450.h | 32 +++++++ 2 files changed, 171 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c index 247f12df8974..93154c9c98dd 100644 --- a/drivers/regulator/pca9450-regulator.c +++ b/drivers/regulator/pca9450-regulator.c @@ -1147,6 +1147,143 @@ static int pca9450_i2c_restart_handler(struct sys_off_data *data) return 0; } +static int pca9450_of_init(struct pca9450 *pca9450) +{ + struct i2c_client *i2c = container_of(pca9450->dev, struct i2c_client, dev); + int ret; + unsigned int val; + unsigned int reset_ctrl; + unsigned int rstb_deb_ctrl; + unsigned int t_on_deb, t_off_deb; + unsigned int t_on_step, t_off_step; + unsigned int t_restart; + + if (of_property_read_bool(i2c->dev.of_node, "nxp,wdog_b-warm-reset")) + reset_ctrl = WDOG_B_CFG_WARM; + else + reset_ctrl = WDOG_B_CFG_COLD_LDO12; + + /* Set reset behavior on assertion of WDOG_B signal */ + ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL, + WDOG_B_CFG_MASK, reset_ctrl); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Failed to set WDOG_B reset behavior\n"); + + ret = of_property_read_u32(i2c->dev.of_node, "npx,pmic-rst-b-debounce-ms", &val); + if (ret == -EINVAL) + rstb_deb_ctrl = T_PMIC_RST_DEB_50MS; + else if (ret) + return ret; + else { + switch (val) { + case 10: rstb_deb_ctrl = T_PMIC_RST_DEB_10MS; break; + case 50: rstb_deb_ctrl = T_PMIC_RST_DEB_50MS; break; + case 100: rstb_deb_ctrl = T_PMIC_RST_DEB_100MS; break; + case 500: rstb_deb_ctrl = T_PMIC_RST_DEB_500MS; break; + case 1000: rstb_deb_ctrl = T_PMIC_RST_DEB_1S; break; + case 2000: rstb_deb_ctrl = T_PMIC_RST_DEB_2S; break; + case 4000: rstb_deb_ctrl = T_PMIC_RST_DEB_4S; break; + case 8000: rstb_deb_ctrl = T_PMIC_RST_DEB_8S; break; + default: return -EINVAL; + } + } + ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL, + T_PMIC_RST_DEB_MASK, rstb_deb_ctrl); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Failed to set PMIC_RST_B debounce time\n"); + + ret = of_property_read_u32(i2c->dev.of_node, "nxp,pmic-on-req-on-debounce-us", &val); + if (ret == 
-EINVAL) + t_on_deb = T_ON_DEB_20MS; + else if (ret) + return ret; + else { + switch (val) { + case 120: t_on_deb = T_ON_DEB_120US; break; + case 20000: t_on_deb = T_ON_DEB_20MS; break; + case 100000: t_on_deb = T_ON_DEB_100MS; break; + case 750000: t_on_deb = T_ON_DEB_750MS; break; + default: return -EINVAL; + } + } + + ret = of_property_read_u32(i2c->dev.of_node, "nxp,pmic-on-req-off-debounce-us", &val); + if (ret == -EINVAL) + t_off_deb = T_OFF_DEB_120US; + else if (ret) + return ret; + else { + switch (val) { + case 120: t_off_deb = T_OFF_DEB_120US; break; + case 2000: t_off_deb = T_OFF_DEB_2MS; break; + default: return -EINVAL; + } + } + + ret = of_property_read_u32(i2c->dev.of_node, "nxp,power-on-step-ms", &val); + if (ret == -EINVAL) + t_on_step = T_ON_STEP_2MS; + else if (ret) + return ret; + else { + switch (val) { + case 1: t_on_step = T_ON_STEP_1MS; break; + case 2: t_on_step = T_ON_STEP_2MS; break; + case 4: t_on_step = T_ON_STEP_4MS; break; + case 8: t_on_step = T_ON_STEP_8MS; break; + default: return -EINVAL; + } + } + + ret = of_property_read_u32(i2c->dev.of_node, "nxp,power-down-step-ms", &val); + if (ret == -EINVAL) + t_off_step = T_OFF_STEP_8MS; + else if (ret) + return ret; + else { + switch (val) { + case 2: t_off_step = T_OFF_STEP_2MS; break; + case 4: t_off_step = T_OFF_STEP_4MS; break; + case 8: t_off_step = T_OFF_STEP_8MS; break; + case 16: t_off_step = T_OFF_STEP_16MS; break; + default: return -EINVAL; + } + } + + ret = of_property_read_u32(i2c->dev.of_node, "nxp,restart-ms", &val); + if (ret == -EINVAL) + t_restart = T_RESTART_250MS; + else if (ret) + return ret; + else { + switch (val) { + case 250: t_restart = T_RESTART_250MS; break; + case 500: t_restart = T_RESTART_500MS; break; + default: return -EINVAL; + } + } + + ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_PWRCTRL, + T_ON_DEB_MASK | T_OFF_DEB_MASK | T_ON_STEP_MASK | + T_OFF_STEP_MASK | T_RESTART_MASK, + t_on_deb | t_off_deb | t_on_step | + t_off_step | t_restart); + if (ret) + return dev_err_probe(&i2c->dev, ret, + "Failed to set PWR_CTRL debounce configuration\n"); + + if (of_property_read_bool(i2c->dev.of_node, "nxp,i2c-lt-enable")) { + /* Enable I2C Level Translator */ + ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_CONFIG2, + I2C_LT_MASK, I2C_LT_ON_STANDBY_RUN); + if (ret) + return dev_err_probe(&i2c->dev, ret, + "Failed to enable I2C level translator\n"); + } + + return 0; +} + static int pca9450_i2c_probe(struct i2c_client *i2c) { enum pca9450_chip_type type = (unsigned int)(uintptr_t) @@ -1156,7 +1293,6 @@ static int pca9450_i2c_probe(struct i2c_client *i2c) struct regulator_dev *ldo5; struct pca9450 *pca9450; unsigned int device_id, i; - unsigned int reset_ctrl; int ret; pca9450 = devm_kzalloc(&i2c->dev, sizeof(struct pca9450), GFP_KERNEL); @@ -1254,25 +1390,9 @@ static int pca9450_i2c_probe(struct i2c_client *i2c) if (ret) return dev_err_probe(&i2c->dev, ret, "Failed to clear PRESET_EN bit\n"); - if (of_property_read_bool(i2c->dev.of_node, "nxp,wdog_b-warm-reset")) - reset_ctrl = WDOG_B_CFG_WARM; - else - reset_ctrl = WDOG_B_CFG_COLD_LDO12; - - /* Set reset behavior on assertion of WDOG_B signal */ - ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL, - WDOG_B_CFG_MASK, reset_ctrl); + ret = pca9450_of_init(pca9450); if (ret) - return dev_err_probe(&i2c->dev, ret, "Failed to set WDOG_B reset behavior\n"); - - if (of_property_read_bool(i2c->dev.of_node, "nxp,i2c-lt-enable")) { - /* Enable I2C Level Translator */ - ret = regmap_update_bits(pca9450->regmap, 
PCA9450_REG_CONFIG2, - I2C_LT_MASK, I2C_LT_ON_STANDBY_RUN); - if (ret) - return dev_err_probe(&i2c->dev, ret, - "Failed to enable I2C level translator\n"); - } + return dev_err_probe(&i2c->dev, ret, "Unable to parse OF data\n"); /* * For LDO5 we need to be able to check the status of the SD_VSEL input in diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 85b4fecc10d8..0df8b3c48082 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -223,12 +223,44 @@ enum { #define IRQ_THERM_105 0x02 #define IRQ_THERM_125 0x01 +/* PCA9450_REG_PWRCTRL bits */ +#define T_ON_DEB_MASK 0xC0 +#define T_ON_DEB_120US (0 << 6) +#define T_ON_DEB_20MS (1 << 6) +#define T_ON_DEB_100MS (2 << 6) +#define T_ON_DEB_750MS (3 << 6) +#define T_OFF_DEB_MASK 0x20 +#define T_OFF_DEB_120US (0 << 5) +#define T_OFF_DEB_2MS (1 << 5) +#define T_ON_STEP_MASK 0x18 +#define T_ON_STEP_1MS (0 << 3) +#define T_ON_STEP_2MS (1 << 3) +#define T_ON_STEP_4MS (2 << 3) +#define T_ON_STEP_8MS (3 << 3) +#define T_OFF_STEP_MASK 0x06 +#define T_OFF_STEP_2MS (0 << 1) +#define T_OFF_STEP_4MS (1 << 1) +#define T_OFF_STEP_8MS (2 << 1) +#define T_OFF_STEP_16MS (3 << 1) +#define T_RESTART_MASK 0x01 +#define T_RESTART_250MS 0 +#define T_RESTART_500MS 1 + /* PCA9450_REG_RESET_CTRL bits */ #define WDOG_B_CFG_MASK 0xC0 #define WDOG_B_CFG_NONE 0x00 #define WDOG_B_CFG_WARM 0x40 #define WDOG_B_CFG_COLD_LDO12 0x80 #define WDOG_B_CFG_COLD 0xC0 +#define T_PMIC_RST_DEB_MASK 0x07 +#define T_PMIC_RST_DEB_10MS 0x00 +#define T_PMIC_RST_DEB_50MS 0x01 +#define T_PMIC_RST_DEB_100MS 0x02 +#define T_PMIC_RST_DEB_500MS 0x03 +#define T_PMIC_RST_DEB_1S 0x04 +#define T_PMIC_RST_DEB_2S 0x05 +#define T_PMIC_RST_DEB_4S 0x06 +#define T_PMIC_RST_DEB_8S 0x07 /* PCA9450_REG_CONFIG2 bits */ #define I2C_LT_MASK 0x03 -- cgit v1.2.3 From d85b56af22f371409cbf667bab26f938e6528d2e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:30 +0200 Subject: efi: Fix trailing whitespace in header file Resolve an issue with the coding style. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 0b9eb3d2ff97..60e994096e20 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -290,7 +290,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor, unsigned long *data_size, void *data); typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor); -typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, +typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); -- cgit v1.2.3 From 17029cdd8f9d0182a6499e0b7bfc6391e8463091 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:33 +0200 Subject: efi/libstub: gop: Add support for reading EDID Add support for EFI_EDID_DISCOVERED_PROTOCOL and EFI_EDID_ACTIVE_PROTOCOL as defined in UEFI 2.8, sec 12.9. Define GUIDs and data structures in the rsp header files. In the GOP setup function, read the EDID of the primary GOP device. First try EFI_EDID_ACTIVE_PROTOCOL, which supports user-specified EDID data. Or else try EFI_EDID_DISCOVERED_PROTOCOL, which returns the display device's native EDID. 
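As a rough illustration of this lookup order, the sketch below is not code
from this patch: the query_edid() helper, the GUID macro names and the use
of the edid_info::dummy buffer are assumptions made for the example.

	static void query_edid(efi_handle_t handle, struct edid_info *edid)
	{
		efi_guid_t active_guid = EFI_EDID_ACTIVE_PROTOCOL_GUID;		/* assumed name */
		efi_guid_t discovered_guid = EFI_EDID_DISCOVERED_PROTOCOL_GUID;	/* assumed name */
		efi_edid_active_protocol_t *active;
		efi_edid_discovered_protocol_t *discovered;
		efi_status_t status;
		u32 size = 0;
		u8 *data = NULL;

		/* Prefer the active protocol, which may carry user-specified EDID data */
		status = efi_bs_call(handle_protocol, handle, &active_guid, (void **)&active);
		if (status == EFI_SUCCESS && active->size_of_edid) {
			size = active->size_of_edid;
			data = active->edid;
		} else {
			/* Fall back to the EDID read from the attached display */
			status = efi_bs_call(handle_protocol, handle, &discovered_guid,
					     (void **)&discovered);
			if (status == EFI_SUCCESS && discovered->size_of_edid) {
				size = discovered->size_of_edid;
				data = discovered->edid;
			}
		}

		memset(edid, 0, sizeof(*edid));
		if (data)
			memcpy(edid->dummy, data, min_t(u32, size, sizeof(edid->dummy)));
	}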
If no EDID could be retrieved, clear the storage.

Rename efi_setup_gop() to efi_setup_graphics() to reflect the changes. Let
callers pass an optional instance of struct edid_info, if they are
interested. While screen_info and edid_info come from the same device
handle, they should be considered independent data. The former refers to
the graphics mode, the latter refers to the display device. GOP devices
might not provide both.

Signed-off-by: Thomas Zimmermann
Reviewed-by: Javier Martinez Canillas
Signed-off-by: Ard Biesheuvel
---
 drivers/firmware/efi/libstub/efi-stub.c |  2 +-
 drivers/firmware/efi/libstub/efistub.h  | 31 +++++++++++++++++++++++++++-
 drivers/firmware/efi/libstub/gop.c      | 36 ++++++++++++++++++++++++++++++++-
 drivers/firmware/efi/libstub/x86-stub.c |  2 +-
 include/linux/efi.h                     |  2 ++
 5 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index 874f63b4a383..9cb814c5ba1b 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void)
 {
 	struct screen_info *si, tmp = {};
 
-	if (efi_setup_gop(&tmp) != EFI_SUCCESS)
+	if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS)
 		return NULL;
 
 	si = alloc_screen_info();
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index f5ba032863a9..b2fb0c3fa721 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -34,6 +34,9 @@
 #define EFI_ALLOC_LIMIT ULONG_MAX
 #endif
 
+struct edid_info;
+struct screen_info;
+
 extern bool efi_no5lvl;
 extern bool efi_nochunk;
 extern bool efi_nokaslr;
@@ -578,6 +581,32 @@ union efi_graphics_output_protocol {
 	} mixed_mode;
 };
 
+typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t;
+
+union efi_edid_discovered_protocol {
+	struct {
+		u32 size_of_edid;
+		u8 *edid;
+	};
+	struct {
+		u32 size_of_edid;
+		u32 edid;
+	} mixed_mode;
+};
+
+typedef union efi_edid_active_protocol efi_edid_active_protocol_t;
+
+union efi_edid_active_protocol {
+	struct {
+		u32 size_of_edid;
+		u8 *edid;
+	};
+	struct {
+		u32 size_of_edid;
+		u32 edid;
+	} mixed_mode;
+};
+
 typedef union {
 	struct {
 		u32 revision;
@@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline);
 
 void efi_parse_option_graphics(char *option);
 
-efi_status_t efi_setup_gop(struct screen_info *si);
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid);
 
 efi_status_t handle_cmdline_files(efi_loaded_image_t *image,
 				  const efi_char16_t *optstr,
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
index 02459ef0f18c..72d74436a7a4 100644
--- a/drivers/firmware/efi/libstub/gop.c
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include