From 1da33858af6250184d2ef907494d698af03283de Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 31 Jul 2025 21:38:18 +0100 Subject: regmap: irq: Free the regmap-irq mutex We do not currently free the mutex allocated by regmap-irq, do so. Tested-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250731-regmap-irq-nesting-v1-1-98b4d1bf20f0@kernel.org Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index d1585f073776..4aac12d38215 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -816,7 +816,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, d->mask_buf[i], chip->irq_drv_data); if (ret) - goto err_alloc; + goto err_mutex; } if (chip->mask_base && !chip->handle_mask_sync) { @@ -827,7 +827,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (ret) { dev_err(map->dev, "Failed to set masks in 0x%x: %d\n", reg, ret); - goto err_alloc; + goto err_mutex; } } @@ -838,7 +838,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (ret) { dev_err(map->dev, "Failed to set masks in 0x%x: %d\n", reg, ret); - goto err_alloc; + goto err_mutex; } } @@ -855,7 +855,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (ret != 0) { dev_err(map->dev, "Failed to read IRQ status: %d\n", ret); - goto err_alloc; + goto err_mutex; } } @@ -879,7 +879,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (ret != 0) { dev_err(map->dev, "Failed to ack 0x%x: %d\n", reg, ret); - goto err_alloc; + goto err_mutex; } } } @@ -901,7 +901,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (ret != 0) { dev_err(map->dev, "Failed to set masks in 0x%x: %d\n", reg, ret); - goto err_alloc; + goto err_mutex; } } } @@ -910,7 +910,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (chip->status_is_level) { ret = read_irq_data(d); if (ret < 0) - goto err_alloc; + goto err_mutex; memcpy(d->prev_status_buf, d->status_buf, array_size(d->chip->num_regs, sizeof(d->prev_status_buf[0]))); @@ -918,7 +918,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, ret = regmap_irq_create_domain(fwnode, irq_base, chip, d); if (ret) - goto err_alloc; + goto err_mutex; ret = request_threaded_irq(irq, NULL, regmap_irq_thread, irq_flags | IRQF_ONESHOT, @@ -935,6 +935,8 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, err_domain: /* Should really dispose of the domain but... */ +err_mutex: + mutex_destroy(&d->lock); err_alloc: kfree(d->type_buf); kfree(d->type_buf_def); @@ -1027,6 +1029,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) kfree(d->config_buf[i]); kfree(d->config_buf); } + mutex_destroy(&d->lock); kfree(d); } EXPORT_SYMBOL_GPL(regmap_del_irq_chip); -- cgit v1.2.3 From 76b6e14aa7b081337d118a82397d919b5e072bb4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 31 Jul 2025 21:38:19 +0100 Subject: regmap: irq: Avoid lockdep warnings with nested regmap-irq chips While handling interrupts through regmap-irq we use a mutex to protect the updates we are caching while genirq runs in atomic context. Russell King reported that while running on the nVidia Jetson Xavier NX this generates lockdep warnings since that platform has a regmap-irq for the max77686 RTC which is a child of a max77620 which also uses regmap-irq. [ 46.723127] rtcwake/3984 is trying to acquire lock: [ 46.723235] ffff0000813b2c68 (&d->lock){+.+.}-{4:4}, at: regmap_irq_lock+0x18/0x24 [ 46.723452] but task is already holding lock: [ 46.723556] ffff00008504dc68 (&d->lock){+.+.}-{4:4}, at: regmap_irq_lock+0x18/0x24 This happens because by default lockdep uses a single lockdep class for all mutexes initialised from a single mutex_init() call and is unable to tell that two distinct mutex are being taken and verify that the ordering of operations is safe. This should be a very rare situation since normally anything using regmap-irq will be a leaf interrupt controller due to being on a slow bus like I2C. We can avoid these warnings by providing the lockdep key for the regmap-irq explicitly, allocating one for each chip so that lockdep can distinguish between them. Thanks to Russell for the report and analysis. Reported-by: Russell King (Oracle) Tested-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250731-regmap-irq-nesting-v1-2-98b4d1bf20f0@kernel.org Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index 4aac12d38215..6112d942499b 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -21,6 +21,7 @@ struct regmap_irq_chip_data { struct mutex lock; + struct lock_class_key lock_key; struct irq_chip irq_chip; struct regmap *map; @@ -801,7 +802,13 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, goto err_alloc; } - mutex_init(&d->lock); + /* + * If one regmap-irq is the parent of another then we'll try + * to lock the child with the parent locked, use an explicit + * lock_key so lockdep can figure out what's going on. + */ + lockdep_register_key(&d->lock_key); + mutex_init_with_key(&d->lock, &d->lock_key); for (i = 0; i < chip->num_irqs; i++) d->mask_buf_def[chip->irqs[i].reg_offset / map->reg_stride] @@ -937,6 +944,7 @@ err_domain: /* Should really dispose of the domain but... */ err_mutex: mutex_destroy(&d->lock); + lockdep_unregister_key(&d->lock_key); err_alloc: kfree(d->type_buf); kfree(d->type_buf_def); @@ -1030,6 +1038,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) kfree(d->config_buf); } mutex_destroy(&d->lock); + lockdep_unregister_key(&d->lock_key); kfree(d); } EXPORT_SYMBOL_GPL(regmap_del_irq_chip); -- cgit v1.2.3 From 886f42ce96e7ce80545704e7168a9c6b60cd6c03 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 28 Jul 2025 16:08:29 +0100 Subject: regmap: mmio: Add missing MODULE_DESCRIPTION() There were already several commits to add module descriptions to regmap modules. But this one was still missing: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/base/regmap/regmap-mmio.o Signed-off-by: Luis Henriques Link: https://patch.msgid.link/20250728150829.11890-1-luis@igalia.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-mmio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 99d7fd85ca7d..29e5f3175301 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -609,4 +609,5 @@ void regmap_mmio_detach_clk(struct regmap *map) } EXPORT_SYMBOL_GPL(regmap_mmio_detach_clk); +MODULE_DESCRIPTION("regmap MMIO Module"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 5c36b86d2bf68fbcad16169983ef7ee8c537db59 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Aug 2025 15:07:18 +0200 Subject: regmap: Remove superfluous check for !config in __regmap_init() The first thing __regmap_init() do is check if config is non-NULL, so there is no need to check for this again later. Fixes: d77e745613680c54 ("regmap: Add bulk read/write callbacks into regmap_config") Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/a154d9db0f290dda96b48bd817eb743773e846e1.1755090330.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 1f3f782a04ba..6883e1a43fe5 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -827,7 +827,7 @@ struct regmap *__regmap_init(struct device *dev, map->read_flag_mask = bus->read_flag_mask; } - if (config && config->read && config->write) { + if (config->read && config->write) { map->reg_read = _regmap_bus_read; if (config->reg_update_bits) map->reg_update_bits = config->reg_update_bits; -- cgit v1.2.3 From 556c1ad666ad90c50ec8fccb930dd5046cfbecfb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: x86/vmscape: Enable the mitigation Enable the previously added mitigation for VMscape. Add the cmdline vmscape={off|ibpb|force} and sysfs reporting. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + Documentation/admin-guide/kernel-parameters.txt | 11 +++ arch/x86/Kconfig | 9 +++ arch/x86/kernel/cpu/bugs.c | 90 ++++++++++++++++++++++ drivers/base/cpu.c | 3 + include/linux/cpu.h | 1 + 6 files changed, 115 insertions(+) (limited to 'drivers/base') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ab8cd337f43a..8aed6d94c4cd 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -586,6 +586,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsa /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/vmscape Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf494..5a7a83c411e9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3829,6 +3829,7 @@ srbds=off [X86,INTEL] ssbd=force-off [ARM64] tsx_async_abort=off [X86] + vmscape=off [X86] Exceptions: This does not have any effect on @@ -8041,6 +8042,16 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vmscape= [X86] Controls mitigation for VMscape attacks. + VMscape attacks can leak information from a userspace + hypervisor to a guest via speculative side-channels. + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier + (IBPB) mitigation (default) + force - force vulnerability detection even on + unaffected processors + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..52c8910ba2ef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2701,6 +2701,15 @@ config MITIGATION_TSA security vulnerability on AMD CPUs which can lead to forwarding of invalid info to subsequent instructions and thus can affect their timing and thereby cause a leakage. + +config MITIGATION_VMSCAPE + bool "Mitigate VMSCAPE" + depends on KVM + default y + help + Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security + vulnerability on Intel and AMD CPUs that may allow a guest to do + Spectre v2 style attacks on userspace hypervisor. endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 410f8df8b77a..c81024dfc4c8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -96,6 +96,9 @@ static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); static void __init tsa_select_mitigation(void); static void __init tsa_apply_mitigation(void); +static void __init vmscape_select_mitigation(void); +static void __init vmscape_update_mitigation(void); +static void __init vmscape_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -270,6 +273,7 @@ void __init cpu_select_mitigations(void) its_select_mitigation(); bhi_select_mitigation(); tsa_select_mitigation(); + vmscape_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -301,6 +305,7 @@ void __init cpu_select_mitigations(void) bhi_update_mitigation(); /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ srso_update_mitigation(); + vmscape_update_mitigation(); spectre_v1_apply_mitigation(); spectre_v2_apply_mitigation(); @@ -318,6 +323,7 @@ void __init cpu_select_mitigations(void) its_apply_mitigation(); bhi_apply_mitigation(); tsa_apply_mitigation(); + vmscape_apply_mitigation(); } /* @@ -3322,6 +3328,77 @@ static void __init srso_apply_mitigation(void) } } +#undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (cpu_mitigations_off() || + !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); +} + #undef pr_fmt #define pr_fmt(fmt) fmt @@ -3570,6 +3647,11 @@ static ssize_t tsa_show_state(char *buf) return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); } +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3636,6 +3718,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TSA: return tsa_show_state(buf); + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3727,6 +3812,11 @@ ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_TSA); } + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edd..008da0354fba 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -603,6 +603,7 @@ CPU_SHOW_VULN_FALLBACK(ghostwrite); CPU_SHOW_VULN_FALLBACK(old_microcode); CPU_SHOW_VULN_FALLBACK(indirect_target_selection); CPU_SHOW_VULN_FALLBACK(tsa); +CPU_SHOW_VULN_FALLBACK(vmscape); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -622,6 +623,7 @@ static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL); static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL); +static DEVICE_ATTR(vmscape, 0444, cpu_show_vmscape, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -642,6 +644,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_old_microcode.attr, &dev_attr_indirect_target_selection.attr, &dev_attr_tsa.attr, + &dev_attr_vmscape.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b91b993f58ee..487b3bf2e1ea 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 292cb391479d50f4379a0abab34324de92c82a92 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Aug 2025 23:03:52 -0700 Subject: software node: Constify node_group in registration functions The software_node_register_node_group() and software_node_unregister_node_group() functions take in essence an array of pointers to software_node structs. Since the functions do not modify the array declare the argument as constant, so that static arrays can be declared as const and annotated as __initconst. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/2zny5grbgtwbplynxffxg6dkgjgqf45aigwmgxio5stesdr3wi@gf2zamk5amic Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 5 ++--- include/linux/property.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index deda7f35a059..be1e9e61a7bf 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -844,7 +844,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, * of this function or by ordering the array such that parent comes before * child. */ -int software_node_register_node_group(const struct software_node **node_group) +int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; @@ -877,8 +877,7 @@ EXPORT_SYMBOL_GPL(software_node_register_node_group); * remove the nodes individually, in the correct order (child before * parent). */ -void software_node_unregister_node_group( - const struct software_node **node_group) +void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e..d1e80b3c9918 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -574,8 +574,8 @@ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); -int software_node_register_node_group(const struct software_node **node_group); -void software_node_unregister_node_group(const struct software_node **node_group); +int software_node_register_node_group(const struct software_node * const *node_group); +void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct software_node *node); void software_node_unregister(const struct software_node *node); -- cgit v1.2.3 From e246518aa24f1460902725e97d0abf574aec6ade Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 Aug 2025 13:43:47 +0200 Subject: PM: sleep: annotate RCU list iterations These iterations require the read lock, otherwise RCU lockdep will splat: ============================= WARNING: suspicious RCU usage 6.17.0-rc3-00014-g31419c045d64 #6 Tainted: G O ----------------------------- drivers/base/power/main.c:1333 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 5 locks held by rtcwake/547: #0: 00000000643ab418 (sb_writers#6){.+.+}-{0:0}, at: file_start_write+0x2b/0x3a #1: 0000000067a0ca88 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0x181/0x24b #2: 00000000631eac40 (kn->active#3){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x191/0x24b #3: 00000000609a1308 (system_transition_mutex){+.+.}-{4:4}, at: pm_suspend+0xaf/0x30b #4: 0000000060c0fdb0 (device_links_srcu){.+.+}-{0:0}, at: device_links_read_lock+0x75/0x98 stack backtrace: CPU: 0 UID: 0 PID: 547 Comm: rtcwake Tainted: G O 6.17.0-rc3-00014-g31419c045d64 #6 VOLUNTARY Tainted: [O]=OOT_MODULE Stack: 223721b3a80 6089eac6 00000001 00000001 ffffff00 6089eac6 00000535 6086e528 721b3ac0 6003c294 00000000 60031fc0 Call Trace: [<600407ed>] show_stack+0x10e/0x127 [<6003c294>] dump_stack_lvl+0x77/0xc6 [<6003c2fd>] dump_stack+0x1a/0x20 [<600bc2f8>] lockdep_rcu_suspicious+0x116/0x13e [<603d8ea1>] dpm_async_suspend_superior+0x117/0x17e [<603d980f>] device_suspend+0x528/0x541 [<603da24b>] dpm_suspend+0x1a2/0x267 [<603da837>] dpm_suspend_start+0x5d/0x72 [<600ca0c9>] suspend_devices_and_enter+0xab/0x736 [...] Add the fourth argument to the iteration to annotate this and avoid the splat. Fixes: 06799631d522 ("PM: sleep: Make async suspend handle suppliers like parents") Fixes: ed18738fff02 ("PM: sleep: Make async resume handle consumers like children") Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250826134348.aba79f6e6299.I9ecf55da46ccf33778f2c018a82e1819d815b348@changeid Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index dbf5456cd891..2ea6e05e6ec9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -675,7 +675,7 @@ static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ - list_for_each_entry_rcu(link, &dev->links.consumers, s_node) + list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); @@ -1330,7 +1330,7 @@ static void dpm_async_suspend_superior(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); -- cgit v1.2.3 From f4672dc6e9c07643c8c755856ba8e9eb9ca95d0c Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Thu, 28 Aug 2025 23:07:01 +0800 Subject: regmap: use int type to store negative error codes Change the 'ret' variable from unsigned int to int to store negative error codes or zero returned by regmap_field_read() and regmap_read(), and change '-1' to 'negative errno' in the comments. Storing the negative error codes in unsigned type, doesn't cause an issue at runtime but it's ugly as pants. Additionally, assigning negative error codes to unsigned type may trigger a GCC warning when the -Wsign-conversion flag is enabled. No effect on runtime. Signed-off-by: Qianfeng Rong Message-ID: <20250828150702.193288-1-rongqianfeng@vivo.com> Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 6883e1a43fe5..ce9be3989a21 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2258,12 +2258,14 @@ EXPORT_SYMBOL_GPL(regmap_field_update_bits_base); * @field: Register field to operate on * @bits: Bits to test * - * Returns -1 if the underlying regmap_field_read() fails, 0 if at least one of the - * tested bits is not set and 1 if all tested bits are set. + * Returns negative errno if the underlying regmap_field_read() fails, + * 0 if at least one of the tested bits is not set and 1 if all tested + * bits are set. */ int regmap_field_test_bits(struct regmap_field *field, unsigned int bits) { - unsigned int val, ret; + unsigned int val; + int ret; ret = regmap_field_read(field, &val); if (ret) @@ -3309,7 +3311,8 @@ EXPORT_SYMBOL_GPL(regmap_update_bits_base); */ int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) { - unsigned int val, ret; + unsigned int val; + int ret; ret = regmap_read(map, reg, &val); if (ret) -- cgit v1.2.3 From b57fc652ca24ada3b0c888327f9944ed21559286 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:05 -0700 Subject: drivers/base/node: Add a helper function node_update_perf_attrs() Add helper function node_update_perf_attrs() to allow update of node access coordinates computed by an external agent such as CXL. The helper allows updating of coordinates after the attribute being created by HMAT. Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/base/node.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/node.h | 8 ++++++++ 2 files changed, 46 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/node.c b/drivers/base/node.c index 3399594136b2..db18a1c32637 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -248,6 +248,44 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, } EXPORT_SYMBOL_GPL(node_set_perf_attrs); +/** + * node_update_perf_attrs - Update the performance values for given access class + * @nid: Node identifier to be updated + * @coord: Heterogeneous memory performance coordinates + * @access: The access class for the given attributes + */ +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access) +{ + struct node_access_nodes *access_node; + struct node *node; + int i; + + if (WARN_ON_ONCE(!node_online(nid))) + return; + + node = node_devices[nid]; + list_for_each_entry(access_node, &node->access_list, list_node) { + if (access_node->access != access) + continue; + + access_node->coord = *coord; + for (i = 0; access_attrs[i]; i++) { + sysfs_notify(&access_node->dev.kobj, + NULL, access_attrs[i]->name); + } + break; + } + + /* When setting CPU access coordinates, update mempolicy */ + if (access != ACCESS_COORDINATE_CPU) + return; + + if (mempolicy_set_node_perf(nid, coord)) + pr_info("failed to set mempolicy attrs for node %d\n", nid); +} +EXPORT_SYMBOL_GPL(node_update_perf_attrs); + /** * struct node_cache_info - Internal tracking for memory node caches * @dev: Device represeting the cache level diff --git a/include/linux/node.h b/include/linux/node.h index 2c7529335b21..866e3323f1fd 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,6 +85,8 @@ struct node_cache_attrs { void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, enum access_coordinate_class access); +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -96,6 +98,12 @@ static inline void node_set_perf_attrs(unsigned int nid, enum access_coordinate_class access) { } + +static inline void node_update_perf_attrs(unsigned int nid, + struct access_coordinate *coord, + enum access_coordinate_class access) +{ +} #endif struct node { -- cgit v1.2.3 From be82483d1b60baf6747884bd74cb7de484deaf76 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Sep 2025 15:55:45 +0200 Subject: PM: sleep: core: Clear power.must_resume in noirq suspend error path If system suspend is aborted in the "noirq" phase (for instance, due to an error returned by one of the device callbacks), power.is_noirq_suspended will not be set for some devices and device_resume_noirq() will return early for them. Consequently, noirq resume callbacks will not run for them at all because the noirq suspend callbacks have not run for them yet. If any of them has power.must_resume set and late suspend has been skipped for it (due to power.smart_suspend), early resume should be skipped for it either, or its state may become inconsistent (for instance, if the early resume assumes that it will always follow noirq resume). Make that happen by clearing power.must_resume in device_resume_noirq() for devices with power.is_noirq_suspended clear that have been left in suspend by device_suspend_late(), which will subsequently cause device_resume_early() to leave the device in suspend and avoid changing its state. Fixes: 0d4b54c6fee8 ("PM / core: Add LEAVE_SUSPENDED driver flag") Link: https://lore.kernel.org/linux-pm/5d692b81-6f58-4e86-9cb0-ede69a09d799@rowland.harvard.edu/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3381776.aeNJFYEL58@rafael.j.wysocki --- drivers/base/power/main.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2ea6e05e6ec9..c883b01ffbdd 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -724,8 +724,20 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy if (dev->power.syscore || dev->power.direct_complete) goto Out; - if (!dev->power.is_noirq_suspended) + if (!dev->power.is_noirq_suspended) { + /* + * This means that system suspend has been aborted in the noirq + * phase before invoking the noirq suspend callback for the + * device, so if device_suspend_late() has left it in suspend, + * device_resume_early() should leave it in suspend either in + * case the early resume of it depends on the noirq resume that + * has not run. + */ + if (dev_pm_skip_suspend(dev)) + dev->power.must_resume = false; + goto Out; + } if (!dpm_wait_for_superior(dev, async)) goto Out; -- cgit v1.2.3 From 1ad926459970444af1140f9b393f416536e1a828 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2025 12:56:41 +0200 Subject: driver core: faux: Set power.no_pm for faux devices Since faux devices are not supposed to be involved in any kind of power management, set the no_pm flag for all of them. Signed-off-by: Rafael J. Wysocki Cc: stable Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/6206518.lOV4Wx5bFT@rafael.j.wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/faux.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/base') diff --git a/drivers/base/faux.c b/drivers/base/faux.c index f5fbda0a9a44..21dd02124231 100644 --- a/drivers/base/faux.c +++ b/drivers/base/faux.c @@ -155,6 +155,7 @@ struct faux_device *faux_device_create_with_groups(const char *name, dev->parent = &faux_bus_root; dev->bus = &faux_bus_type; dev_set_name(dev, "%s", name); + device_set_pm_not_required(dev); ret = device_add(dev); if (ret) { -- cgit v1.2.3 From fdd9ae23bb989fa9ed1beebba7d3e0c82c7c81ae Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Sep 2025 15:43:50 +0200 Subject: PM: core: Annotate loops walking device links as _srcu Since SRCU is used for the protection of device link lists, the loops over device link lists in multiple places in drivers/base/power/main.c and in pm_runtime_get_suppliers() should be annotated as _srcu rather than as _rcu which is the case currently. Change the annotations accordingly. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2393512.ElGaqSPkdT@rafael.j.wysocki --- drivers/base/power/main.c | 18 +++++++++--------- drivers/base/power/runtime.c | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2ea6e05e6ec9..2fa53896db82 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -40,8 +40,8 @@ typedef int (*pm_callback_t)(struct device *); -#define list_for_each_entry_rcu_locked(pos, head, member) \ - list_for_each_entry_rcu(pos, head, member, \ +#define list_for_each_entry_srcu_locked(pos, head, member) \ + list_for_each_entry_srcu(pos, head, member, \ device_links_read_lock_held()) /* @@ -281,7 +281,7 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async) * callbacks freeing the link objects for the links in the list we're * walking. */ - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) + list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); @@ -338,7 +338,7 @@ static void dpm_wait_for_consumers(struct device *dev, bool async) * continue instead of trying to continue in parallel with its * unregistration). */ - list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) + list_for_each_entry_srcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); @@ -675,7 +675,7 @@ static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ - list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) + list_for_each_entry_srcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); @@ -1330,7 +1330,7 @@ static void dpm_async_suspend_superior(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) + list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); @@ -1384,7 +1384,7 @@ static void dpm_superior_set_must_resume(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) + list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; device_links_read_unlock(idx); @@ -1813,7 +1813,7 @@ static void dpm_clear_superiors_direct_complete(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); @@ -2065,7 +2065,7 @@ static bool device_prepare_smart_suspend(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3e84dc4122de..8c23a11e8017 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1903,8 +1903,8 @@ void pm_runtime_get_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, - device_links_read_lock_held()) + list_for_each_entry_srcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) if (device_link_test(link, DL_FLAG_PM_RUNTIME)) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); -- cgit v1.2.3 From 3ce3f569991347d2085925041f4932232da43bcf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Sep 2025 15:45:14 +0200 Subject: PM: core: Add two macros for walking device links Add separate macros for walking links to suppliers and consumers of a device to help device links users to avoid exposing the internals of struct dev_links_info in their code and possible coding mistakes related to that. Accordingly, use the new macros to replace open-coded device links list walks in the core power management code. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/1944671.tdWV9SEqCh@rafael.j.wysocki --- drivers/base/base.h | 8 ++++++++ drivers/base/power/main.c | 18 +++++++----------- drivers/base/power/runtime.c | 3 +-- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/base.h b/drivers/base/base.h index 123031a757d9..700aecd22fd3 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -251,6 +251,14 @@ void device_links_unbind_consumers(struct device *dev); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); +#define dev_for_each_link_to_supplier(__link, __dev) \ + list_for_each_entry_srcu(__link, &(__dev)->links.suppliers, c_node, \ + device_links_read_lock_held()) + +#define dev_for_each_link_to_consumer(__link, __dev) \ + list_for_each_entry_srcu(__link, &(__dev)->links.consumers, s_node, \ + device_links_read_lock_held()) + /* device pm support */ void device_pm_move_to_tail(struct device *dev); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2fa53896db82..e3a7f25bf8d3 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -40,10 +40,6 @@ typedef int (*pm_callback_t)(struct device *); -#define list_for_each_entry_srcu_locked(pos, head, member) \ - list_for_each_entry_srcu(pos, head, member, \ - device_links_read_lock_held()) - /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and @@ -281,7 +277,7 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async) * callbacks freeing the link objects for the links in the list we're * walking. */ - list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) + dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); @@ -338,7 +334,7 @@ static void dpm_wait_for_consumers(struct device *dev, bool async) * continue instead of trying to continue in parallel with its * unregistration). */ - list_for_each_entry_srcu_locked(link, &dev->links.consumers, s_node) + dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); @@ -675,7 +671,7 @@ static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ - list_for_each_entry_srcu_locked(link, &dev->links.consumers, s_node) + dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); @@ -1330,7 +1326,7 @@ static void dpm_async_suspend_superior(struct device *dev, async_func_t func) idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ - list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) + dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); @@ -1384,7 +1380,7 @@ static void dpm_superior_set_must_resume(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) + dev_for_each_link_to_supplier(link, dev) link->supplier->power.must_resume = true; device_links_read_unlock(idx); @@ -1813,7 +1809,7 @@ static void dpm_clear_superiors_direct_complete(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) { + dev_for_each_link_to_supplier(link, dev) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); @@ -2065,7 +2061,7 @@ static bool device_prepare_smart_suspend(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_srcu_locked(link, &dev->links.suppliers, c_node) { + dev_for_each_link_to_supplier(link, dev) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 8c23a11e8017..7420b9851fe0 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1903,8 +1903,7 @@ void pm_runtime_get_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_srcu(link, &dev->links.suppliers, c_node, - device_links_read_lock_held()) + dev_for_each_link_to_supplier(link, dev) if (device_link_test(link, DL_FLAG_PM_RUNTIME)) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); -- cgit v1.2.3 From 2b2d4c744e1ab8b8d41c5c2ccf889f5441e50105 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 27 Aug 2025 19:40:21 +0800 Subject: drivers: base: fix "publically"->"publicly" Trivial fix to spelling mistake in comment text. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250827114021.476668-1-zhao.xichao@vivo.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edd..ccaebb8ac656 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -325,7 +325,7 @@ static void cpu_device_release(struct device *dev) * This is an empty function to prevent the driver core from spitting a * warning at us. Yes, I know this is directly opposite of what the * documentation for the driver core and kobjects say, and the author - * of this code has already been publically ridiculed for doing + * of this code has already been publicly ridiculed for doing * something as foolish as this. However, at this point in time, it is * the only way to handle the issue of statically allocated cpu * devices. The different architectures will have their cpu device -- cgit v1.2.3 From a86537ad21c7ba587241b0dcdd186f894a69fac7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 22:58:57 +0200 Subject: driver core: get_dev_from_fwnode(): document potential race Commit 9a4681a485ee ("driver core: Export get_dev_from_fwnode()") made get_dev_from_fwnode() publicly available, but didn't document the guarantees a caller must uphold: get_dev_from_fwnode() obtains a reference count from the device pointer stored in a struct fwnode_handle. While having its own reference count, struct fwnode_handle does not keep a reference count of the device it has a pointer to. Consequently, a caller must guarantee that it is impossible that the last device reference is dropped and the device is released concurrently while calling get_dev_from_fwnode(), otherwise this is a potential UAF and hence a bug. Thus, document this potential race condition for get_dev_from_fwnode(). Cc: Ulf Hansson Cc: Saravana Kannan Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250829205911.33142-1-dakr@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index d22d6b23e758..cd806be435cd 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -5278,6 +5278,25 @@ void device_set_node(struct device *dev, struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(device_set_node); +/** + * get_dev_from_fwnode - Obtain a reference count of the struct device the + * struct fwnode_handle is associated with. + * @fwnode: The pointer to the struct fwnode_handle to obtain the struct device + * reference count of. + * + * This function obtains a reference count of the device the device pointer + * embedded in the struct fwnode_handle points to. + * + * Note that the struct device pointer embedded in struct fwnode_handle does + * *not* have a reference count of the struct device itself. + * + * Hence, it is a UAF (and thus a bug) to call this function if the caller can't + * guarantee that the last reference count of the corresponding struct device is + * not dropped concurrently. + * + * This is possible since struct fwnode_handle has its own reference count and + * hence can out-live the struct device it is associated with. + */ struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode) { return get_device((fwnode)->dev); -- cgit v1.2.3 From 716cec5fc92f6d4090a54ddb01042bce02b3c771 Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Sun, 31 Aug 2025 22:49:30 +0300 Subject: driver core: Fix order of the kernel-doc parameters Fix the order of the kernel-doc parameters in device_find_child() and device_for_each_child*() functions to match the actual functions signature. No functional changes. Signed-off-by: Gil Fine Link: https://lore.kernel.org/r/20250831194930.2063390-1-gil.fine@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index cd806be435cd..fa8093119602 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3994,8 +3994,8 @@ const char *device_get_devnode(const struct device *dev, /** * device_for_each_child - device child iterator. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4024,8 +4024,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child); /** * device_for_each_child_reverse - device child iterator in reversed order. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4055,8 +4055,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse); * device_for_each_child_reverse_from - device child iterator in reversed order. * @parent: parent struct device. * @from: optional starting point in child list - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, starting at @from, and call @fn * for each, passing it @data. This helper is identical to @@ -4089,8 +4089,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from); /** * device_find_child - device iterator for locating a particular device. * @parent: parent struct device - * @match: Callback function to check device * @data: Data to pass to match function + * @match: Callback function to check device * * This is similar to the device_for_each_child() function above, but it * returns a reference to a device that is 'found' for later use, as -- cgit v1.2.3 From eca710386972f2a72708b0b2a615eb150d218e18 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 27 Aug 2025 13:05:41 +0300 Subject: driver core: auxiliary bus: Drop dev_pm_domain_detach() call Starting with commit f99508074e78 ("PM: domains: Detach on device_unbind_cleanup()"), there is no longer a need to call dev_pm_domain_detach() in the bus remove function. The device_unbind_cleanup() function now handles this to avoid invoking devres cleanup handlers while the PM domain is powered off, which could otherwise lead to failures as described in the above-mentioned commit. Drop the explicit dev_pm_domain_detach() call and rely instead on the flags passed to dev_pm_domain_attach() to power off the domain. Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20250827100541.926350-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 12ffdd843756..f65ba30d0617 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -217,17 +217,14 @@ static int auxiliary_bus_probe(struct device *dev) struct auxiliary_device *auxdev = to_auxiliary_dev(dev); int ret; - ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) { dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); return ret; } - ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); - if (ret) - dev_pm_domain_detach(dev, true); - - return ret; + return auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); } static void auxiliary_bus_remove(struct device *dev) @@ -237,7 +234,6 @@ static void auxiliary_bus_remove(struct device *dev) if (auxdrv->remove) auxdrv->remove(auxdev); - dev_pm_domain_detach(dev, true); } static void auxiliary_bus_shutdown(struct device *dev) -- cgit v1.2.3 From 4c48aed6dfcd32ea23e52adc1072405a62facf46 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 3 Sep 2025 19:37:22 +0800 Subject: driver core: auxiliary bus: Optimize logic of auxiliary_match_id() auxiliary_match_id() repeatedly calculates variable @match_size in the for loop, however, the variable is fixed actually, so it is enough to only calculate the variable once. Besides, the function should return directly if name of the @auxdev does not include '.', but it still iterates over the ID table. Additionally, statement 'dev_name(&auxdev->dev)' is fixed, but may be evaluated more than 3 times. Optimize logic of the function by: - Move the logic calculating the variable out of the for loop - Return NULL directly if @p == NULL - Give the statement an dedicated local variable @auxdev_name Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250903-fix_auxbus-v2-1-3eae8374fd65@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index f65ba30d0617..04bdbff4dbe5 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -171,17 +171,18 @@ static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, const struct auxiliary_device *auxdev) { - for (; id->name[0]; id++) { - const char *p = strrchr(dev_name(&auxdev->dev), '.'); - int match_size; + const char *auxdev_name = dev_name(&auxdev->dev); + const char *p = strrchr(auxdev_name, '.'); + int match_size; - if (!p) - continue; - match_size = p - dev_name(&auxdev->dev); + if (!p) + return NULL; + match_size = p - auxdev_name; + for (; id->name[0]; id++) { /* use dev_name(&auxdev->dev) prefix before last '.' char to match to */ if (strlen(id->name) == match_size && - !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + !strncmp(auxdev_name, id->name, match_size)) return id; } return NULL; -- cgit v1.2.3 From d364d2ad07873dc4991b2a631a8536597272418b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:11 +0200 Subject: devres: provide devm_kmemdup_const() Provide a function similar to devm_strdup_const() but for copying blocks of memory that are likely to be placed in .rodata. Reviewed-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/base/devres.c | 21 +++++++++++++++++++++ include/linux/device/devres.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/devres.c b/drivers/base/devres.c index ff55e1bcfa30..c948c88d3956 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -1117,6 +1117,27 @@ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL_GPL(devm_kmemdup); +/** + * devm_kmemdup_const - conditionally duplicate and manage a region of memory + * + * @dev: Device this memory belongs to + * @src: memory region to duplicate + * @len: memory region length, + * @gfp: GFP mask to use + * + * Return: source address if it is in .rodata or the return value of kmemdup() + * to which the function falls back otherwise. + */ +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)src)) + return src; + + return devm_kmemdup(dev, src, len, gfp); +} +EXPORT_SYMBOL_GPL(devm_kmemdup_const); + struct pages_devres { unsigned long addr; unsigned int order; diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index ae696d10faff..8c5f57e0d613 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -80,6 +80,8 @@ void devm_kfree(struct device *dev, const void *p); void * __realloc_size(3) devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp); static inline void *devm_kmemdup_array(struct device *dev, const void *src, size_t n, size_t size, gfp_t flags) { -- cgit v1.2.3 From 786eb990cfb78aab94eb74fb32a030e14723a620 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Fri, 22 Aug 2025 14:18:45 +0530 Subject: drivers/base/node: handle error properly in register_one_node() If register_node() returns an error, it is not handled correctly. The function will proceed further and try to register CPUs under the node, which is not correct. So, in this patch, if register_node() returns an error, we return immediately from the function. Link: https://lkml.kernel.org/r/20250822084845.19219-1-donettom@linux.ibm.com Fixes: 76b67ed9dce6 ("[PATCH] node hotplug: register cpu: remove node struct") Signed-off-by: Donet Tom Acked-by: David Hildenbrand Cc: Alison Schofield Cc: Danilo Krummrich Cc: Dave Jiang Cc: Donet Tom Cc: Greg Kroah-Hartman Cc: Hiroyouki Kamezawa Cc: Joanthan Cameron Cc: Oscar Salvador Cc: "Ritesh Harjani (IBM)" Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/base/node.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/node.c b/drivers/base/node.c index 3399594136b2..45d512939c40 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -885,6 +885,11 @@ int register_one_node(int nid) node_devices[nid] = node; error = register_node(node_devices[nid], nid); + if (error) { + node_devices[nid] = NULL; + kfree(node); + return error; + } /* link cpu under this node */ for_each_present_cpu(cpu) { -- cgit v1.2.3 From 3d18f80ce181ba27f37d0ec1c550b22acb01dd49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:52 +1000 Subject: VFS: rename kern_path_locked() and related functions. kern_path_locked() is now only used to prepare for removing an object from the filesystem (and that is the only credible reason for wanting a positive locked dentry). Thus it corresponds to kern_path_create() and so should have a corresponding name. Unfortunately the name "kern_path_create" is somewhat misleading as it doesn't actually create anything. The recently added simple_start_creating() provides a better pattern I believe. The "start" can be matched with "end" to bracket the creating or removing. So this patch changes names: kern_path_locked -> start_removing_path kern_path_create -> start_creating_path user_path_create -> start_creating_user_path user_path_locked_at -> start_removing_user_path_at done_path_create -> end_creating_path and also introduces end_removing_path() which is identical to end_creating_path(). __start_removing_path (which was __kern_path_locked) is enhanced to call mnt_want_write() for consistency with the start_creating_path(). Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 12 ++++++ arch/powerpc/platforms/cell/spufs/syscalls.c | 4 +- drivers/base/devtmpfs.c | 22 +++++------ fs/bcachefs/fs-ioctl.c | 10 ++--- fs/init.c | 17 ++++---- fs/namei.c | 59 +++++++++++++++++----------- fs/ocfs2/refcounttree.c | 4 +- fs/smb/server/vfs.c | 8 ++-- include/linux/namei.h | 14 ++++--- kernel/bpf/inode.c | 4 +- net/unix/af_unix.c | 6 +-- 11 files changed, 93 insertions(+), 67 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..e0494860be6b 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. + +--- + +**mandatory** + +Several functions are renamed: + +- kern_path_locked -> start_removing_path +- kern_path_create -> start_creating_path +- user_path_create -> start_creating_user_path +- user_path_locked_at -> start_removing_user_path_at +- done_path_create -> end_creating_path diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93..ea4ba1b6ce6a 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c..9d4e46ad8352 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4e72e654da96..43510da5e734 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -255,7 +255,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); } - dst_dentry = user_path_create(arg.dirfd, + dst_dentry = start_creating_user_path(arg.dirfd, (const char __user *)(unsigned long)arg.dst_ptr, &dst_path, lookup_flags); error = PTR_ERR_OR_ZERO(dst_dentry); @@ -314,7 +314,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, d_instantiate(dst_dentry, &inode->v); fsnotify_mkdir(dir, dst_dentry); err3: - done_path_create(&dst_path, dst_dentry); + end_creating_path(&dst_path, dst_dentry); err2: if (arg.src_ptr) path_put(&src_path); @@ -334,7 +334,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (arg.flags) return -EINVAL; - victim = user_path_locked_at(arg.dirfd, name, &path); + victim = start_removing_user_path_at(arg.dirfd, name, &path); if (IS_ERR(victim)) return PTR_ERR(victim); @@ -351,9 +351,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, d_invalidate(victim); } err: - inode_unlock(dir); - dput(victim); - path_put(&path); + end_removing_path(&path, victim); return ret; } diff --git a/fs/init.c b/fs/init.c index eef5124885e3..07f592ccdba8 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); @@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index 4017bc8641d3..92973a7a8091 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2758,7 +2758,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2770,15 +2771,26 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } /** @@ -2816,24 +2828,26 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -4223,8 +4237,8 @@ out: return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4232,9 +4246,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4242,10 +4256,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4253,7 +4268,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4361,7 +4376,7 @@ retry: break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4465,7 +4480,7 @@ retry: if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4819,7 +4834,7 @@ retry: if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4988,7 +5003,7 @@ retry: error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e..267b50e8e42e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 07739055ac9f..1cfa688904b2 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -1325,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, path, flags); kfree(abs_name); return dent; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 1d5038c21c20..a7800ef04e76 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -59,11 +59,15 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..fadf3817a9c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ff..768098dec231 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; -- cgit v1.2.3 From 159c86f306ea20c019b36fc998e2677008183c04 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 18 Aug 2025 09:39:11 +0530 Subject: ACPI: Add support for nargs_prop in acpi_fwnode_get_reference_args() Currently, ACPI does not support the use of a nargs_prop (e.g., associated with a reference in fwnode_property_get_reference_args(). Instead, ACPI expects the number of arguments (nargs) to be explicitly passed or known. This behavior diverges from Open Firmware (OF), which allows the use of a #*-cells property in the referenced node to determine the number of arguments. Since fwnode_property_get_reference_args() is a common interface used across both OF and ACPI firmware paradigms, it is desirable to have a unified calling convention that works seamlessly for both. Add the support for ACPI to parse a nargs_prop from the referenced fwnode, aligning its behavior with the OF backend. This allows drivers and subsystems using fwnode_property_get_reference_args() to work in a firmware-agnostic way without having to hardcode or special-case argument counts for ACPI. Acked-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko Signed-off-by: Sunil V L Signed-off-by: Anup Patel Acked-by: Jassi Brar Link: https://lore.kernel.org/r/20250818040920.272664-16-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- drivers/acpi/property.c | 29 +++++++++++++++++++++++++---- drivers/base/property.c | 2 +- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'drivers/base') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index d4863746fb11..e92402deee77 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -804,13 +804,35 @@ acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, return NULL; } +static unsigned int acpi_fwnode_get_args_count(struct fwnode_handle *fwnode, + const char *nargs_prop) +{ + const struct acpi_device_data *data; + const union acpi_object *obj; + int ret; + + data = acpi_device_data_of_node(fwnode); + if (!data) + return 0; + + ret = acpi_data_get_property(data, nargs_prop, ACPI_TYPE_INTEGER, &obj); + if (ret) + return 0; + + return obj->integer.value; +} + static int acpi_get_ref_args(struct fwnode_reference_args *args, struct fwnode_handle *ref_fwnode, + const char *nargs_prop, const union acpi_object **element, const union acpi_object *end, size_t num_args) { u32 nargs = 0, i; + if (nargs_prop) + num_args = acpi_fwnode_get_args_count(ref_fwnode, nargs_prop); + /* * Assume the following integer elements are all args. Stop counting on * the first reference (possibly represented as a string) or end of the @@ -961,10 +983,10 @@ static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, return -EINVAL; element++; - ret = acpi_get_ref_args(idx == index ? args : NULL, acpi_fwnode_handle(device), - &element, end, args_count); + nargs_prop, &element, end, + args_count); if (ret < 0) return ret; @@ -979,9 +1001,8 @@ static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, return -EINVAL; element++; - ret = acpi_get_ref_args(idx == index ? args : NULL, - ref_fwnode, &element, end, + ref_fwnode, nargs_prop, &element, end, args_count); if (ret < 0) return ret; diff --git a/drivers/base/property.c b/drivers/base/property.c index f626d5bbe806..6a63860579dd 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -578,7 +578,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_match_property_string); * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, - * otherwise @nargs is ignored. Only relevant on OF. + * otherwise @nargs is ignored. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. -- cgit v1.2.3 From 7f7acd193ba8aaa8ed07cfadc335bb17a991fd42 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:14 -0700 Subject: PM: runtime: Add basic kunit tests for API contracts In exploring the various return codes and failure modes of runtime PM APIs, I found it helpful to verify and codify many of them in unit tests, especially given that even the kerneldoc can be rather complex to reason through, and it also has had subtle errors of its own. Notably, I avoid testing the return codes for pm_runtime_put() and pm_runtime_put_autosuspend(), since code that checks them is probably wrong, and we're considering making them return 'void' altogether. I still test the sync() variants, since those have a bit more meaning to them. Signed-off-by: Brian Norris Signed-off-by: Rafael J. Wysocki --- drivers/base/Kconfig | 6 + drivers/base/power/Makefile | 1 + drivers/base/power/runtime-test.c | 253 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 drivers/base/power/runtime-test.c (limited to 'drivers/base') diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 064eb52ff7e2..1786d87b29e2 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -167,6 +167,12 @@ config PM_QOS_KUNIT_TEST depends on KUNIT=y default KUNIT_ALL_TESTS +config PM_RUNTIME_KUNIT_TEST + tristate "KUnit Tests for runtime PM" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on PM + default KUNIT_ALL_TESTS + config HMEM_REPORTING bool default n diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 01f11629d241..2989e42d0161 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o wakeup_stats.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_HAVE_CLK) += clock_ops.o obj-$(CONFIG_PM_QOS_KUNIT_TEST) += qos-test.o +obj-$(CONFIG_PM_RUNTIME_KUNIT_TEST) += runtime-test.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c new file mode 100644 index 000000000000..2e966fd96664 --- /dev/null +++ b/drivers/base/power/runtime-test.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google, Inc. + */ + +#include +#include +#include +#include + +#define DEVICE_NAME "pm_runtime_test_device" + +static void pm_runtime_depth_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_get_sync(dev)); /* "already active" */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* Test pm_runtime_put() and friends when already suspended. */ +static void pm_runtime_already_suspended_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); /* no wakeup needed */ + pm_runtime_put(dev); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 1, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + pm_runtime_put_autosuspend(dev); + + /* Grab 2 refcounts */ + pm_runtime_get_noresume(dev); + pm_runtime_get_noresume(dev); + /* The first put() sees usage_count 1 */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync_autosuspend(dev)); + /* The second put() sees usage_count 0 but tells us "already suspended". */ + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + /* Should have remained suspended the whole time. */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static void pm_runtime_idle_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + pm_runtime_put_noidle(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); +} + +static void pm_runtime_disabled_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + /* Never called pm_runtime_enable() */ + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); + + /* "disabled" is treated as "active" */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_suspended(dev)); + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. + */ + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_autosuspend(dev)); + + /* Still disabled */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); +} + +static void pm_runtime_error_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + /* Fake a .runtime_resume() error */ + dev->power.runtime_error = -EIO; + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. + */ + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_autosuspend(dev)); + + /* Error is still pending */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EIO, dev->power.runtime_error); + /* Clear error */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, dev->power.runtime_error); + /* Still suspended */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_barrier(dev)); /* resume was pending */ + pm_runtime_put(dev); + pm_runtime_suspend(dev); /* flush the put(), to suspend */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_resume_and_get(dev)); + + /* + * The following should all return -EAGAIN (usage is non-zero) or 1 + * (already resumed). + */ + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + /* Suspended again */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* + * Explore a typical probe() sequence in which a device marks itself powered, + * but doesn't hold any runtime PM reference, so it suspends as soon as it goes + * idle. + */ +static void pm_runtime_probe_active_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_status_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_active(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Nothing to flush. We stay active. */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Ask for idle? Now we suspend. */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static struct kunit_case pm_runtime_test_cases[] = { + KUNIT_CASE(pm_runtime_depth_test), + KUNIT_CASE(pm_runtime_already_suspended_test), + KUNIT_CASE(pm_runtime_idle_test), + KUNIT_CASE(pm_runtime_disabled_test), + KUNIT_CASE(pm_runtime_error_test), + KUNIT_CASE(pm_runtime_probe_active_test), + {} +}; + +static struct kunit_suite pm_runtime_test_suite = { + .name = "pm_runtime_test_cases", + .test_cases = pm_runtime_test_cases, +}; + +kunit_test_suite(pm_runtime_test_suite); +MODULE_DESCRIPTION("Runtime power management unit test suite"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d0b8651a026125d58b50b464aeb78f2c5956179f Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:15 -0700 Subject: PM: runtime: Make put{,_sync}() return 1 when already suspended The pm_runtime.h docs say pm_runtime_put() and pm_runtime_put_sync() return 1 when already suspended, but this is not true -- they return -EAGAIN. On the other hand, pm_runtime_put_sync_suspend() and pm_runtime_put_sync_autosuspend() *do* return 1. This is an artifact of the fact that the former are built on rpm_idle(), whereas the latter are built on rpm_suspend(). There are precious few pm_runtime_put()/pm_runtime_put_sync() callers that check the return code at all, but most of them only log errors, and usually only for negative error codes. None of them should be treating this as an error, so: * at best, this may fix some case where a driver treats this condition as an error, when it shouldn't; * at worst, this should make no effect; and * somewhere in between, we could potentially clear up non-fatal log messages. Fix the pm_runtime_already_suspended_test() while tweaking the behavior. The test makes a lot more sense when these all return 1 when the device is already suspended: pm_runtime_put_sync(dev); pm_runtime_suspend(dev); pm_runtime_autosuspend(dev); pm_request_autosuspend(dev); pm_runtime_put_sync_autosuspend(dev); Notably, I've avoided testing the return codes for these, since they really should be ignored by callers, and we may make them 'void' altogether: pm_runtime_put(dev); pm_runtime_put_autosuspend(dev); Signed-off-by: Brian Norris Reviewed-by: Dhruva Gole Reviewed-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime-test.c | 2 +- drivers/base/power/runtime.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c index 2e966fd96664..eca9885e807d 100644 --- a/drivers/base/power/runtime-test.c +++ b/drivers/base/power/runtime-test.c @@ -42,7 +42,7 @@ static void pm_runtime_already_suspended_test(struct kunit *test) pm_runtime_put(dev); pm_runtime_get_noresume(dev); - KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync(dev)); KUNIT_EXPECT_EQ(test, 1, pm_runtime_suspend(dev)); KUNIT_EXPECT_EQ(test, 1, pm_runtime_autosuspend(dev)); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3e84dc4122de..faa68bf9ef3d 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -498,6 +498,9 @@ static int rpm_idle(struct device *dev, int rpmflags) if (retval < 0) ; /* Conditions are wrong. */ + else if ((rpmflags & RPM_GET_PUT) && retval == 1) + ; /* put() is allowed in RPM_SUSPENDED */ + /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; -- cgit v1.2.3 From 632d31067be2f414c57955efcf29c79290cc749b Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Fri, 26 Sep 2025 18:23:18 +0800 Subject: PM: sleep: Do not wait on SYNC_STATE_ONLY device links Device links with DL_FLAG_SYNC_STATE_ONLY should not affect system suspend and resume, and functions like device_reorder_to_tail() and device_link_add() don't try to reorder the consumers with that flag. However, dpm_wait_for_consumers() and dpm_wait_for_suppliers() don't check thas flag before triggering dpm_wait(), leading to potential hang during suspend/resume. This can be reproduced on MT8186 Corsola Chromebook with devicetree like: usb-a-connector { compatible = "usb-a-connector"; port { usb_a_con: endpoint { remote-endpoint = <&usb_hs>; }; }; }; usb_host { compatible = "mediatek,mt8186-xhci", "mediatek,mtk-xhci"; port { usb_hs: endpoint { remote-endpoint = <&usb_a_con>; }; }; }; In this case, the two nodes form a cycle and a SYNC_STATE_ONLY devlink between usb_host (supplier) and usb-a-connector (consumer) is created. Address this by exporting device_link_flag_is_sync_state_only() and making dpm_wait_for_consumers() and dpm_wait_for_suppliers() use it when deciding if dpm_wait() should be called. Fixes: 05ef983e0d65a ("driver core: Add device link support for SYNC_STATE_ONLY flag") Signed-off-by: Pin-yen Lin Link: https://patch.msgid.link/20250926102320.4053167-1-treapking@chromium.org [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/base/base.h | 1 + drivers/base/core.c | 2 +- drivers/base/power/main.c | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/base.h b/drivers/base/base.h index 700aecd22fd3..86fa7fbb3548 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -248,6 +248,7 @@ void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); +bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index d22d6b23e758..a54ec6df1058 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -287,7 +287,7 @@ static bool device_is_ancestor(struct device *dev, struct device *target) #define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ DL_FLAG_CYCLE | \ DL_FLAG_MANAGED) -static inline bool device_link_flag_is_sync_state_only(u32 flags) +bool device_link_flag_is_sync_state_only(u32 flags) { return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e3a7f25bf8d3..c97f8b139dc2 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -278,7 +278,8 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async) * walking. */ dev_for_each_link_to_supplier(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); @@ -335,7 +336,8 @@ static void dpm_wait_for_consumers(struct device *dev, bool async) * unregistration). */ dev_for_each_link_to_consumer(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); -- cgit v1.2.3 From 0efdedfa537eb534c251a5b4794caaf72cc55869 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 18 Sep 2025 11:11:44 +0530 Subject: drivers/base/node: fix double free in register_one_node() When device_register() fails in register_node(), it calls put_device(&node->dev). This triggers node_device_release(), which calls kfree(to_node(dev)), thereby freeing the entire node structure. As a result, when register_node() returns an error, the node memory has already been freed. Calling kfree(node) again in register_one_node() leads to a double free. This patch removes the redundant kfree(node) from register_one_node() to prevent the double free. Link: https://lkml.kernel.org/r/20250918054144.58980-1-donettom@linux.ibm.com Fixes: 786eb990cfb7 ("drivers/base/node: handle error properly in register_one_node()") Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Chris Mason Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Hiroyouki Kamezawa Cc: Joanthan Cameron Cc: "Ritesh Harjani (IBM)" Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/base/node.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/node.c b/drivers/base/node.c index 45d512939c40..67b01d579737 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -887,7 +887,6 @@ int register_one_node(int nid) error = register_node(node_devices[nid], nid); if (error) { node_devices[nid] = NULL; - kfree(node); return error; } -- cgit v1.2.3 From 9a0abc39450a3123fd52533a662fbd37e0d1508c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 17:47:14 +0200 Subject: PM: runtime: Add auto-cleanup macros for "resume and get" operations It is generally useful to be able to automatically drop a device's runtime PM usage counter incremented by runtime PM operations that resume a device and bump up its usage counter [1]. To that end, add guard definition macros allowing pm_runtime_put() and pm_runtime_put_autosuspend() to be used for the auto-cleanup in those cases. Simply put, a piece of code like below: pm_runtime_get_sync(dev); ..... pm_runtime_put(dev); return 0; can be transformed with guard() like: guard(pm_runtime_active)(dev); ..... return 0; (see the pm_runtime_put() call is gone). However, it is better to do proper error handling in the majority of cases, so doing something like this instead of the above is recommended: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; ..... return 0; In all of the cases in which runtime PM is known to be enabled for the given device or the device can be regarded as operational (and so it can be accessed) with runtime PM disabled, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put(dev); return 0; can be changed as follows: ACQUIRE(pm_runtime_active_try, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try, &pm); if (ret < 0) return ret; ..... return 0; (again, see the pm_runtime_put() call is gone). Still, if the device cannot be accessed unless runtime PM has been enabled for it, the pm_runtime_active_try_enabled guard variant needs to be used, that is (in the context of the example above): ACQUIRE(pm_runtime_active_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; When the original code calls pm_runtime_put_autosuspend(), use one of the "auto" guard variants, pm_runtime_active_auto/_try/_enabled, so for example, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put_autosuspend(dev); return 0; will become: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; Note that the cases in which the return value of pm_runtime_get_sync() is checked can also be handled with the help of the new guard macros. For example, a piece of code like: ret = pm_runtime_get_sync(dev); if (ret < 0) { pm_runtime_put(dev); return ret; } ..... pm_runtime_put(dev); return 0; can be rewritten as: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; or pm_runtime_get_active_try can be used if transparent handling of disabled runtime PM is desirable. Link: https://lore.kernel.org/linux-pm/878qimv24u.wl-tiwai@suse.de/ [1] Link: https://lore.kernel.org/linux-pm/20250926150613.000073a4@huawei.com/ Signed-off-by: Rafael J. Wysocki Acked-by: Dan Williams Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/2238241.irdbgypaU6@rafael.j.wysocki [ rjw: Fixed leftovers from the previous version in the changelog ] Reviewed-by: Jonathan Cameron Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 2 ++ include/linux/pm_runtime.h | 44 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index faa68bf9ef3d..0fc825066ede 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -799,6 +799,8 @@ static int rpm_resume(struct device *dev, int rpmflags) if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; + else if (rpmflags & RPM_TRANSPARENT) + goto out; else retval = -EACCES; } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d1ff76e0e2d0..e5426bdd0c9f 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -21,6 +21,7 @@ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +#define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations @@ -512,6 +513,19 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +static inline int pm_runtime_get_active(struct device *dev, int rpmflags) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. @@ -522,15 +536,7 @@ static inline int pm_runtime_get_sync(struct device *dev) */ static inline int pm_runtime_resume_and_get(struct device *dev) { - int ret; - - ret = __pm_runtime_resume(dev, RPM_GET_PUT); - if (ret < 0) { - pm_runtime_put_noidle(dev); - return ret; - } - - return 0; + return pm_runtime_get_active(dev, 0); } /** @@ -610,6 +616,26 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_active, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put(_T)); +DEFINE_GUARD(pm_runtime_active_auto, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); +/* + * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). + * + * The difference between the "_try" and "_try_enabled" variants is that the + * former do not produce an error when runtime PM is disabled for the given + * device. + */ +DEFINE_GUARD_COND(pm_runtime_active, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, + pm_runtime_resume_and_get(_T)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, + pm_runtime_resume_and_get(_T)) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From 92158fae2ed986f44347fc5b9a269830862c1529 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Oct 2025 12:29:31 +0300 Subject: PM: runtime: Fix error checking for kunit_device_register() The kunit_device_register() function never returns NULL, it returns error pointers. Update the assertions to use KUNIT_ASSERT_NOT_ERR_OR_NULL() instead of checking for NULL. Fixes: 7f7acd193ba8 ("PM: runtime: Add basic kunit tests for API contracts") Signed-off-by: Dan Carpenter Reviewed-by: Brian Norris Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime-test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c index eca9885e807d..477feca804c7 100644 --- a/drivers/base/power/runtime-test.c +++ b/drivers/base/power/runtime-test.c @@ -14,7 +14,7 @@ static void pm_runtime_depth_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); pm_runtime_enable(dev); @@ -32,7 +32,7 @@ static void pm_runtime_already_suspended_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); pm_runtime_enable(dev); KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); @@ -70,7 +70,7 @@ static void pm_runtime_idle_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); pm_runtime_enable(dev); @@ -91,7 +91,7 @@ static void pm_runtime_disabled_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); /* Never called pm_runtime_enable() */ KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); @@ -131,7 +131,7 @@ static void pm_runtime_error_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); pm_runtime_enable(dev); KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); @@ -214,7 +214,7 @@ static void pm_runtime_probe_active_test(struct kunit *test) { struct device *dev = kunit_device_register(test, DEVICE_NAME); - KUNIT_ASSERT_PTR_NE(test, NULL, dev); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); KUNIT_EXPECT_TRUE(test, pm_runtime_status_suspended(dev)); -- cgit v1.2.3 From c6a809363a66b8ff0f6a000b5f09408a1b33eeb5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:34 +0200 Subject: drivers/base/memory: add node id parameter to add_memory_block() Patch series "mm/memory_hotplug: fixup crash during uevent handling", v4. we have some udev rules trying to read the sysfs attribute 'valid_zones' during an memory 'add' event, causing a crash in zone_for_pfn_range(). Debugging found that mem->nid was set to NUMA_NO_NODE, which crashed in NODE_DATA(nid). Further analysis revealed that we're running into a race with udev event processing: add_memory_resource() has this function calls: 1) __try_online_node() 2) arch_add_memory() 3) create_memory_block_devices() -> calls device_register() -> memory 'add' event 4) node_set_online()/__register_one_node() -> calls device_register() -> node 'add' event 5) register_memory_blocks_under_node() -> sets mem->nid Which, to the uninitated, is ... weird ... Why do we try to online the node in 1), but only register the node in 4) _after_ we have created the memory blocks in 3) ? And why do we set the 'nid' value in 5), when the uevent (which might need to see the correct 'nid' value) is sent out in 3) ? There must be a reason, I'm sure ... So here's a small patchset to fixup uevent ordering. The first patch adds a 'nid' parameter to add_memory_blocks() (to avoid mem->nid being initialized with NUMA_NO_NODE), and the second patch reshuffles the code in add_memory_resource() to fully initialize the node prior to calling create_memory_block_devices() so that the node is valid at that time and uevent processing will see correct values in sysfs. This patch (of 3): We have some udev rules trying to read the sysfs attribute 'valid_zones' during an memory 'add' event, causing a crash in zone_for_pfn_range(). Debugging found that mem->nid was set to NUMA_NO_NODE, which crashed in NODE_DATA(nid). Further analysis revealed that we're running into a race with udev event processing: add_memory_resource() has this function calls: 1) __try_online_node() 2) arch_add_memory() 3) create_memory_block_devices() -> calls device_register() -> memory 'add' event 4) node_set_online()/__register_one_node() -> calls device_register() -> node 'add' event 5) register_memory_blocks_under_node() -> sets mem->nid Which, to the uninitated, is ... weird ... Why do we try to online the node in 1), but only register the node in 4) _after_ we have created the memory blocks in 3) ? And why do we set the 'nid' value in 5), when the uevent (which might need to see the correct 'nid' value) is sent out in 3) ? There must be a reason, I'm sure ... So here's a small patchset to fixup uevent ordering. The first patch adds a 'nid' parameter to add_memory_blocks() (to avoid mem->nid being initialized with NUMA_NO_NODE), and the second patch reshuffles the code in add_memory_resource() to fully initialize the node prior to calling create_memory_block_devices() so that the node is valid at that time and uevent processing will see correct values in sysfs. This patch (of 3): Add a 'nid' parameter to add_memory_block() to initialize the memory block with the correct node id. Link: https://lkml.kernel.org/r/20250729064637.51662-1-hare@kernel.org Link: https://lkml.kernel.org/r/20250729064637.51662-2-hare@kernel.org Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5c6c1d6bb59f..894d3891292b 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -809,7 +809,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid, } #endif -static int add_memory_block(unsigned long block_id, unsigned long state, +static int add_memory_block(unsigned long block_id, int nid, unsigned long state, struct vmem_altmap *altmap, struct memory_group *group) { @@ -827,7 +827,7 @@ static int add_memory_block(unsigned long block_id, unsigned long state, mem->start_section_nr = block_id * sections_per_block; mem->state = state; - mem->nid = NUMA_NO_NODE; + mem->nid = nid; mem->altmap = altmap; INIT_LIST_HEAD(&mem->group_next); @@ -854,13 +854,6 @@ static int add_memory_block(unsigned long block_id, unsigned long state, return 0; } -static int add_hotplug_memory_block(unsigned long block_id, - struct vmem_altmap *altmap, - struct memory_group *group) -{ - return add_memory_block(block_id, MEM_OFFLINE, altmap, group); -} - static void remove_memory_block(struct memory_block *memory) { if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) @@ -900,7 +893,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_hotplug_memory_block(block_id, altmap, group); + ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_OFFLINE, altmap, group); if (ret) break; } @@ -1005,7 +998,7 @@ void __init memory_dev_init(void) continue; block_id = memory_block_id(nr); - ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL); + ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL); if (ret) { panic("%s() failed to add memory block: %d\n", __func__, ret); -- cgit v1.2.3 From b8179af120943e2fc099ea87caa234039a709a66 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:35 +0200 Subject: mm/memory_hotplug: activate node before adding new memory blocks The sysfs attributes for memory blocks require the node ID to be set and initialized, so move the node activation before adding new memory blocks. This also has the nice side effect that the BUG_ON() can be converted into a WARN_ON() as we now can handle registration errors. Link: https://lkml.kernel.org/r/20250729064637.51662-3-hare@kernel.org Fixes: b9ff036082cd ("mm/memory_hotplug.c: make add_memory_resource use __try_online_node") Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 32 +++++++++++++++++--------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 894d3891292b..fb212a889e65 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -879,7 +879,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -893,7 +893,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_OFFLINE, altmap, group); + ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..4a29153e372e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -159,7 +159,7 @@ static inline unsigned long memory_block_advised_max_size(void) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9f14de4a9c9..0be83039c3b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1477,7 +1477,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, } /* create memory block devices after memory was added */ - ret = create_memory_block_devices(cur_start, memblock_size, + ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); @@ -1539,8 +1539,16 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) ret = __try_online_node(nid, false); if (ret < 0) - goto error; - new_node = ret; + goto error_memblock_remove; + if (ret) { + node_set_online(nid); + ret = register_one_node(nid); + if (WARN_ON(ret)) { + node_set_offline(nid); + goto error_memblock_remove; + } + new_node = true; + } /* * Self hosted memmap array @@ -1556,24 +1564,13 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, NULL, group); + ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } - if (new_node) { - /* If sysfs file of new node can't be created, cpu on the node - * can't be hot-added. There is no rollback way now. - * So, check by BUG_ON() to catch it reluctantly.. - * We online node here. We can't roll back from here. - */ - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); - } - register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); @@ -1597,6 +1594,11 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: + if (new_node) { + node_set_offline(nid); + unregister_one_node(nid); + } +error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: -- cgit v1.2.3 From 0a947c14e48cbf9de222836170282e0167a9e096 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:36 +0200 Subject: drivers/base: move memory_block_add_nid() into the caller Now the node id only needs to be set for early memory, so move memory_block_add_nid() into the caller and rename it into memory_block_add_nid_early(). This allows us to further simplify the code by dropping the 'context' argument to do_register_memory_block_under_node(). Link: https://lkml.kernel.org/r/20250729064637.51662-4-hare@kernel.org Suggested-by: David Hildenbrand Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 36 ++++++++++++++++++------------------ drivers/base/node.c | 10 ++++------ include/linux/memory.h | 3 +-- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fb212a889e65..6d84a02cfa5d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -769,21 +769,22 @@ static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, #ifdef CONFIG_NUMA /** - * memory_block_add_nid() - Indicate that system RAM falling into this memory - * block device (partially) belongs to the given node. + * memory_block_add_nid_early() - Indicate that early system RAM falling into + * this memory block device (partially) belongs + * to the given node. * @mem: The memory block device. * @nid: The node id. - * @context: The memory initialization context. * - * Indicate that system RAM falling into this memory block (partially) belongs - * to the given node. If the context indicates ("early") that we are adding the - * node during node device subsystem initialization, this will also properly - * set/adjust mem->zone based on the zone ranges of the given node. + * Indicate that early system RAM falling into this memory block (partially) + * belongs to the given node. This will also properly set/adjust mem->zone based + * on the zone ranges of the given node. + * + * Memory hotplug handles this on memory block creation, where we can only have + * a single nid span a memory block. */ -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context) +void memory_block_add_nid_early(struct memory_block *mem, int nid) { - if (context == MEMINIT_EARLY && mem->nid != nid) { + if (mem->nid != nid) { /* * For early memory we have to determine the zone when setting * the node id and handle multiple nodes spanning a single @@ -797,15 +798,14 @@ void memory_block_add_nid(struct memory_block *mem, int nid, mem->zone = early_node_zone_for_memory_block(mem, nid); else mem->zone = NULL; + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. If we span multiple nodes (not applicable + * to hotplugged memory), zone == NULL will prohibit memory offlining + * and consequently unplug. + */ + mem->nid = nid; } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. If we span multiple nodes (not applicable - * to hotplugged memory), zone == NULL will prohibit memory offlining - * and consequently unplug. - */ - mem->nid = nid; } #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index 67b01d579737..6b6e55a98b79 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -781,13 +781,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG static void do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk, - enum meminit_context context) + struct memory_block *mem_blk) { int ret; - memory_block_add_nid(mem_blk, nid, context); - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -815,7 +812,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG); + do_register_memory_block_under_node(nid, mem_blk); return 0; } @@ -855,7 +852,8 @@ static void register_memory_blocks_under_nodes(void) if (!mem) continue; - do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY); + memory_block_add_nid_early(mem, nid); + do_register_memory_block_under_node(nid, mem); put_device(&mem->dev); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4a29153e372e..43d378038ce2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -202,8 +202,7 @@ static inline unsigned long phys_to_block_id(unsigned long phys) } #ifdef CONFIG_NUMA -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context); +void memory_block_add_nid_early(struct memory_block *mem, int nid); #endif /* CONFIG_NUMA */ int memory_block_advise_max_size(unsigned long size); unsigned long memory_block_advised_max_size(void); -- cgit v1.2.3 From 74b84d1be0220b99405c16a4a3e1e503e3bd8387 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 7 Oct 2025 11:43:12 +0200 Subject: driver core: fw_devlink: Don't warn about sync_state() pending Due to the wider deployment of the ->sync_state() support, for PM domains for example, we are receiving reports about the sync_state() pending message that is being logged in fw_devlink_dev_sync_state(). In particular as it's printed at the warning level, which is questionable. Even if it certainly is useful to know that the ->sync_state() condition could not be met, there may be nothing wrong with it. For example, a driver may be built as module and are still waiting to be initialized/probed. For this reason let's move to the info level for now. Reported-by: Geert Uytterhoeven Reported-by: Sebin Francis Reported-by: Diederik de Haas Reported-by: Jon Hunter Reviewed-by: Tomi Valkeinen Signed-off-by: Ulf Hansson Reviewed-by: Dhruva Gole Reviewed-by: Sebastian Reichel Reviewed-by: Kevin Hilman Acked-by: Saravana Kannan Reviewed-by: Sebin Francis Tested-by: Sebin Francis Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index 3c533dab8fa5..f69dc9c85954 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1784,7 +1784,7 @@ static int fw_devlink_dev_sync_state(struct device *dev, void *data) return 0; if (fw_devlink_sync_state == FW_DEVLINK_SYNC_STATE_STRICT) { - dev_warn(sup, "sync_state() pending due to %s\n", + dev_info(sup, "sync_state() pending due to %s\n", dev_name(link->consumer)); return 0; } -- cgit v1.2.3 From a91c8096590bd7801a26454789f2992094fe36da Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 23 Jul 2025 16:24:16 +0200 Subject: devcoredump: Fix circular locking dependency with devcd->mutex. The original code causes a circular locking dependency found by lockdep. ====================================================== WARNING: possible circular locking dependency detected 6.16.0-rc6-lgci-xe-xe-pw-151626v3+ #1 Tainted: G S U ------------------------------------------------------ xe_fault_inject/5091 is trying to acquire lock: ffff888156815688 ((work_completion)(&(&devcd->del_wk)->work)){+.+.}-{0:0}, at: __flush_work+0x25d/0x660 but task is already holding lock: ffff888156815620 (&devcd->mutex){+.+.}-{3:3}, at: dev_coredump_put+0x3f/0xa0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&devcd->mutex){+.+.}-{3:3}: mutex_lock_nested+0x4e/0xc0 devcd_data_write+0x27/0x90 sysfs_kf_bin_write+0x80/0xf0 kernfs_fop_write_iter+0x169/0x220 vfs_write+0x293/0x560 ksys_write+0x72/0xf0 __x64_sys_write+0x19/0x30 x64_sys_call+0x2bf/0x2660 do_syscall_64+0x93/0xb60 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (kn->active#236){++++}-{0:0}: kernfs_drain+0x1e2/0x200 __kernfs_remove+0xae/0x400 kernfs_remove_by_name_ns+0x5d/0xc0 remove_files+0x54/0x70 sysfs_remove_group+0x3d/0xa0 sysfs_remove_groups+0x2e/0x60 device_remove_attrs+0xc7/0x100 device_del+0x15d/0x3b0 devcd_del+0x19/0x30 process_one_work+0x22b/0x6f0 worker_thread+0x1e8/0x3d0 kthread+0x11c/0x250 ret_from_fork+0x26c/0x2e0 ret_from_fork_asm+0x1a/0x30 -> #0 ((work_completion)(&(&devcd->del_wk)->work)){+.+.}-{0:0}: __lock_acquire+0x1661/0x2860 lock_acquire+0xc4/0x2f0 __flush_work+0x27a/0x660 flush_delayed_work+0x5d/0xa0 dev_coredump_put+0x63/0xa0 xe_driver_devcoredump_fini+0x12/0x20 [xe] devm_action_release+0x12/0x30 release_nodes+0x3a/0x120 devres_release_all+0x8a/0xd0 device_unbind_cleanup+0x12/0x80 device_release_driver_internal+0x23a/0x280 device_driver_detach+0x14/0x20 unbind_store+0xaf/0xc0 drv_attr_store+0x21/0x50 sysfs_kf_write+0x4a/0x80 kernfs_fop_write_iter+0x169/0x220 vfs_write+0x293/0x560 ksys_write+0x72/0xf0 __x64_sys_write+0x19/0x30 x64_sys_call+0x2bf/0x2660 do_syscall_64+0x93/0xb60 entry_SYSCALL_64_after_hwframe+0x76/0x7e other info that might help us debug this: Chain exists of: (work_completion)(&(&devcd->del_wk)->work) --> kn->active#236 --> &devcd->mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&devcd->mutex); lock(kn->active#236); lock(&devcd->mutex); lock((work_completion)(&(&devcd->del_wk)->work)); *** DEADLOCK *** 5 locks held by xe_fault_inject/5091: #0: ffff8881129f9488 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x72/0xf0 #1: ffff88810c755078 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x123/0x220 #2: ffff8881054811a0 (&dev->mutex){....}-{3:3}, at: device_release_driver_internal+0x55/0x280 #3: ffff888156815620 (&devcd->mutex){+.+.}-{3:3}, at: dev_coredump_put+0x3f/0xa0 #4: ffffffff8359e020 (rcu_read_lock){....}-{1:2}, at: __flush_work+0x72/0x660 stack backtrace: CPU: 14 UID: 0 PID: 5091 Comm: xe_fault_inject Tainted: G S U 6.16.0-rc6-lgci-xe-xe-pw-151626v3+ #1 PREEMPT_{RT,(lazy)} Tainted: [S]=CPU_OUT_OF_SPEC, [U]=USER Hardware name: Micro-Star International Co., Ltd. MS-7D25/PRO Z690-A DDR4(MS-7D25), BIOS 1.10 12/13/2021 Call Trace: dump_stack_lvl+0x91/0xf0 dump_stack+0x10/0x20 print_circular_bug+0x285/0x360 check_noncircular+0x135/0x150 ? register_lock_class+0x48/0x4a0 __lock_acquire+0x1661/0x2860 lock_acquire+0xc4/0x2f0 ? __flush_work+0x25d/0x660 ? mark_held_locks+0x46/0x90 ? __flush_work+0x25d/0x660 __flush_work+0x27a/0x660 ? __flush_work+0x25d/0x660 ? trace_hardirqs_on+0x1e/0xd0 ? __pfx_wq_barrier_func+0x10/0x10 flush_delayed_work+0x5d/0xa0 dev_coredump_put+0x63/0xa0 xe_driver_devcoredump_fini+0x12/0x20 [xe] devm_action_release+0x12/0x30 release_nodes+0x3a/0x120 devres_release_all+0x8a/0xd0 device_unbind_cleanup+0x12/0x80 device_release_driver_internal+0x23a/0x280 ? bus_find_device+0xa8/0xe0 device_driver_detach+0x14/0x20 unbind_store+0xaf/0xc0 drv_attr_store+0x21/0x50 sysfs_kf_write+0x4a/0x80 kernfs_fop_write_iter+0x169/0x220 vfs_write+0x293/0x560 ksys_write+0x72/0xf0 __x64_sys_write+0x19/0x30 x64_sys_call+0x2bf/0x2660 do_syscall_64+0x93/0xb60 ? __f_unlock_pos+0x15/0x20 ? __x64_sys_getdents64+0x9b/0x130 ? __pfx_filldir64+0x10/0x10 ? do_syscall_64+0x1a2/0xb60 ? clear_bhb_loop+0x30/0x80 ? clear_bhb_loop+0x30/0x80 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x76e292edd574 Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d d5 ea 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 RSP: 002b:00007fffe247a828 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000076e292edd574 RDX: 000000000000000c RSI: 00006267f6306063 RDI: 000000000000000b RBP: 000000000000000c R08: 000076e292fc4b20 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00006267f6306063 R13: 000000000000000b R14: 00006267e6859c00 R15: 000076e29322a000 xe 0000:03:00.0: [drm] Xe device coredump has been deleted. Fixes: 01daccf74832 ("devcoredump : Serialize devcd_del work") Cc: Mukesh Ojha Cc: Greg Kroah-Hartman Cc: Johannes Berg Cc: Rafael J. Wysocki Cc: Danilo Krummrich Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Maarten Lankhorst Cc: Matthew Brost Acked-by: Mukesh Ojha Link: https://lore.kernel.org/r/20250723142416.1020423-1-dev@lankhorst.se Signed-off-by: Greg Kroah-Hartman --- drivers/base/devcoredump.c | 136 +++++++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 53 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index 37faf6156d7c..55bdc7f5e59d 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -23,50 +23,46 @@ struct devcd_entry { void *data; size_t datalen; /* - * Here, mutex is required to serialize the calls to del_wk work between - * user/kernel space which happens when devcd is added with device_add() - * and that sends uevent to user space. User space reads the uevents, - * and calls to devcd_data_write() which try to modify the work which is - * not even initialized/queued from devcoredump. + * There are 2 races for which mutex is required. * + * The first race is between device creation and userspace writing to + * schedule immediately destruction. * + * This race is handled by arming the timer before device creation, but + * when device creation fails the timer still exists. * - * cpu0(X) cpu1(Y) + * To solve this, hold the mutex during device_add(), and set + * init_completed on success before releasing the mutex. * - * dev_coredump() uevent sent to user space - * device_add() ======================> user space process Y reads the - * uevents writes to devcd fd - * which results into writes to + * That way the timer will never fire until device_add() is called, + * it will do nothing if init_completed is not set. The timer is also + * cancelled in that case. * - * devcd_data_write() - * mod_delayed_work() - * try_to_grab_pending() - * timer_delete() - * debug_assert_init() - * INIT_DELAYED_WORK() - * schedule_delayed_work() - * - * - * Also, mutex alone would not be enough to avoid scheduling of - * del_wk work after it get flush from a call to devcd_free() - * mentioned as below. - * - * disabled_store() - * devcd_free() - * mutex_lock() devcd_data_write() - * flush_delayed_work() - * mutex_unlock() - * mutex_lock() - * mod_delayed_work() - * mutex_unlock() - * So, delete_work flag is required. + * The second race involves multiple parallel invocations of devcd_free(), + * add a deleted flag so only 1 can call the destructor. */ struct mutex mutex; - bool delete_work; + bool init_completed, deleted; struct module *owner; ssize_t (*read)(char *buffer, loff_t offset, size_t count, void *data, size_t datalen); void (*free)(void *data); + /* + * If nothing interferes and device_add() was returns success, + * del_wk will destroy the device after the timer fires. + * + * Multiple userspace processes can interfere in the working of the timer: + * - Writing to the coredump will reschedule the timer to run immediately, + * if still armed. + * + * This is handled by using "if (cancel_delayed_work()) { + * schedule_delayed_work() }", to prevent re-arming after having + * been previously fired. + * - Writing to /sys/class/devcoredump/disabled will destroy the + * coredump synchronously. + * This is handled by using disable_delayed_work_sync(), and then + * checking if deleted flag is set with &devcd->mutex held. + */ struct delayed_work del_wk; struct device *failing_dev; }; @@ -95,14 +91,27 @@ static void devcd_dev_release(struct device *dev) kfree(devcd); } +static void __devcd_del(struct devcd_entry *devcd) +{ + devcd->deleted = true; + device_del(&devcd->devcd_dev); + put_device(&devcd->devcd_dev); +} + static void devcd_del(struct work_struct *wk) { struct devcd_entry *devcd; + bool init_completed; devcd = container_of(wk, struct devcd_entry, del_wk.work); - device_del(&devcd->devcd_dev); - put_device(&devcd->devcd_dev); + /* devcd->mutex serializes against dev_coredumpm_timeout */ + mutex_lock(&devcd->mutex); + init_completed = devcd->init_completed; + mutex_unlock(&devcd->mutex); + + if (init_completed) + __devcd_del(devcd); } static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj, @@ -122,12 +131,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct devcd_entry *devcd = dev_to_devcd(dev); - mutex_lock(&devcd->mutex); - if (!devcd->delete_work) { - devcd->delete_work = true; - mod_delayed_work(system_wq, &devcd->del_wk, 0); - } - mutex_unlock(&devcd->mutex); + /* + * Although it's tempting to use mod_delayed work here, + * that will cause a reschedule if the timer already fired. + */ + if (cancel_delayed_work(&devcd->del_wk)) + schedule_delayed_work(&devcd->del_wk, 0); return count; } @@ -151,11 +160,21 @@ static int devcd_free(struct device *dev, void *data) { struct devcd_entry *devcd = dev_to_devcd(dev); + /* + * To prevent a race with devcd_data_write(), disable work and + * complete manually instead. + * + * We cannot rely on the return value of + * disable_delayed_work_sync() here, because it might be in the + * middle of a cancel_delayed_work + schedule_delayed_work pair. + * + * devcd->mutex here guards against multiple parallel invocations + * of devcd_free(). + */ + disable_delayed_work_sync(&devcd->del_wk); mutex_lock(&devcd->mutex); - if (!devcd->delete_work) - devcd->delete_work = true; - - flush_delayed_work(&devcd->del_wk); + if (!devcd->deleted) + __devcd_del(devcd); mutex_unlock(&devcd->mutex); return 0; } @@ -179,12 +198,10 @@ static ssize_t disabled_show(const struct class *class, const struct class_attri * put_device() <- last reference * error = fn(dev, data) devcd_dev_release() * devcd_free(dev, data) kfree(devcd) - * mutex_lock(&devcd->mutex); * * * In the above diagram, it looks like disabled_store() would be racing with parallelly - * running devcd_del() and result in memory abort while acquiring devcd->mutex which - * is called after kfree of devcd memory after dropping its last reference with + * running devcd_del() and result in memory abort after dropping its last reference with * put_device(). However, this will not happens as fn(dev, data) runs * with its own reference to device via klist_node so it is not its last reference. * so, above situation would not occur. @@ -374,7 +391,7 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner, devcd->read = read; devcd->free = free; devcd->failing_dev = get_device(dev); - devcd->delete_work = false; + devcd->deleted = false; mutex_init(&devcd->mutex); device_initialize(&devcd->devcd_dev); @@ -383,8 +400,14 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner, atomic_inc_return(&devcd_count)); devcd->devcd_dev.class = &devcd_class; - mutex_lock(&devcd->mutex); dev_set_uevent_suppress(&devcd->devcd_dev, true); + + /* devcd->mutex prevents devcd_del() completing until init finishes */ + mutex_lock(&devcd->mutex); + devcd->init_completed = false; + INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); + schedule_delayed_work(&devcd->del_wk, timeout); + if (device_add(&devcd->devcd_dev)) goto put_device; @@ -401,13 +424,20 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner, dev_set_uevent_suppress(&devcd->devcd_dev, false); kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD); - INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); - schedule_delayed_work(&devcd->del_wk, timeout); + + /* + * Safe to run devcd_del() now that we are done with devcd_dev. + * Alternatively we could have taken a ref on devcd_dev before + * dropping the lock. + */ + devcd->init_completed = true; mutex_unlock(&devcd->mutex); return; put_device: - put_device(&devcd->devcd_dev); mutex_unlock(&devcd->mutex); + cancel_delayed_work_sync(&devcd->del_wk); + put_device(&devcd->devcd_dev); + put_module: module_put(owner); free: -- cgit v1.2.3 From 2eead19334516c8e9927c11b448fbe512b1f18a1 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 23 Sep 2025 23:13:08 +0530 Subject: arch_topology: Fix incorrect error check in topology_parse_cpu_capacity() Fix incorrect use of PTR_ERR_OR_ZERO() in topology_parse_cpu_capacity() which causes the code to proceed with NULL clock pointers. The current logic uses !PTR_ERR_OR_ZERO(cpu_clk) which evaluates to true for both valid pointers and NULL, leading to potential NULL pointer dereference in clk_get_rate(). Per include/linux/err.h documentation, PTR_ERR_OR_ZERO(ptr) returns: "The error code within @ptr if it is an error pointer; 0 otherwise." This means PTR_ERR_OR_ZERO() returns 0 for both valid pointers AND NULL pointers. Therefore !PTR_ERR_OR_ZERO(cpu_clk) evaluates to true (proceed) when cpu_clk is either valid or NULL, causing clk_get_rate(NULL) to be called when of_clk_get() returns NULL. Replace with !IS_ERR_OR_NULL(cpu_clk) which only proceeds for valid pointers, preventing potential NULL pointer dereference in clk_get_rate(). Cc: stable Signed-off-by: Kaushlendra Kumar Reviewed-by: Sudeep Holla Fixes: b8fe128dad8f ("arch_topology: Adjust initial CPU capacities with current freq") Link: https://patch.msgid.link/20250923174308.1771906-1-kaushlendra.kumar@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/arch_topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1037169abb45..e1eff05bea4a 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -292,7 +292,7 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) * frequency (by keeping the initial capacity_freq_ref value). */ cpu_clk = of_clk_get(cpu_node, 0); - if (!PTR_ERR_OR_ZERO(cpu_clk)) { + if (!IS_ERR_OR_NULL(cpu_clk)) { per_cpu(capacity_freq_ref, cpu) = clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); -- cgit v1.2.3 From 434f7349a1f00618a620b316f091bd13a12bc8d2 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 22 Oct 2025 21:10:12 +0100 Subject: regmap: slimbus: fix bus_context pointer in regmap init calls Commit 4e65bda8273c ("ASoC: wcd934x: fix error handling in wcd934x_codec_parse_data()") revealed the problem in the slimbus regmap. That commit breaks audio playback, for instance, on sdm845 Thundercomm Dragonboard 845c board: Unable to handle kernel paging request at virtual address ffff8000847cbad4 ... CPU: 5 UID: 0 PID: 776 Comm: aplay Not tainted 6.18.0-rc1-00028-g7ea30958b305 #11 PREEMPT Hardware name: Thundercomm Dragonboard 845c (DT) ... Call trace: slim_xfer_msg+0x24/0x1ac [slimbus] (P) slim_read+0x48/0x74 [slimbus] regmap_slimbus_read+0x18/0x24 [regmap_slimbus] _regmap_raw_read+0xe8/0x174 _regmap_bus_read+0x44/0x80 _regmap_read+0x60/0xd8 _regmap_update_bits+0xf4/0x140 _regmap_select_page+0xa8/0x124 _regmap_raw_write_impl+0x3b8/0x65c _regmap_bus_raw_write+0x60/0x80 _regmap_write+0x58/0xc0 regmap_write+0x4c/0x80 wcd934x_hw_params+0x494/0x8b8 [snd_soc_wcd934x] snd_soc_dai_hw_params+0x3c/0x7c [snd_soc_core] __soc_pcm_hw_params+0x22c/0x634 [snd_soc_core] dpcm_be_dai_hw_params+0x1d4/0x38c [snd_soc_core] dpcm_fe_dai_hw_params+0x9c/0x17c [snd_soc_core] snd_pcm_hw_params+0x124/0x464 [snd_pcm] snd_pcm_common_ioctl+0x110c/0x1820 [snd_pcm] snd_pcm_ioctl+0x34/0x4c [snd_pcm] __arm64_sys_ioctl+0xac/0x104 invoke_syscall+0x48/0x104 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x34/0xec el0t_64_sync_handler+0xa0/0xf0 el0t_64_sync+0x198/0x19c The __devm_regmap_init_slimbus() started to be used instead of __regmap_init_slimbus() after the commit mentioned above and turns out the incorrect bus_context pointer (3rd argument) was used in __devm_regmap_init_slimbus(). It should be just "slimbus" (which is equal to &slimbus->dev). Correct it. The wcd934x codec seems to be the only or the first user of devm_regmap_init_slimbus() but we should fix it till the point where __devm_regmap_init_slimbus() was introduced therefore two "Fixes" tags. While at this, also correct the same argument in __regmap_init_slimbus(). Fixes: 4e65bda8273c ("ASoC: wcd934x: fix error handling in wcd934x_codec_parse_data()") Fixes: 7d6f7fb053ad ("regmap: add SLIMbus support") Cc: stable@vger.kernel.org Cc: Dmitry Baryshkov Cc: Ma Ke Cc: Steev Klimaszewski Cc: Srinivas Kandagatla Reviewed-by: Abel Vesa Signed-off-by: Alexey Klimov Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20251022201013.1740211-1-alexey.klimov@linaro.org Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-slimbus.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/regmap/regmap-slimbus.c b/drivers/base/regmap/regmap-slimbus.c index 54eb7d227cf4..e523fae73004 100644 --- a/drivers/base/regmap/regmap-slimbus.c +++ b/drivers/base/regmap/regmap-slimbus.c @@ -48,8 +48,7 @@ struct regmap *__regmap_init_slimbus(struct slim_device *slimbus, if (IS_ERR(bus)) return ERR_CAST(bus); - return __regmap_init(&slimbus->dev, bus, &slimbus->dev, config, - lock_key, lock_name); + return __regmap_init(&slimbus->dev, bus, slimbus, config, lock_key, lock_name); } EXPORT_SYMBOL_GPL(__regmap_init_slimbus); @@ -63,8 +62,7 @@ struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus, if (IS_ERR(bus)) return ERR_CAST(bus); - return __devm_regmap_init(&slimbus->dev, bus, &slimbus, config, - lock_key, lock_name); + return __devm_regmap_init(&slimbus->dev, bus, slimbus, config, lock_key, lock_name); } EXPORT_SYMBOL_GPL(__devm_regmap_init_slimbus); -- cgit v1.2.3