From 5634c8cb298a7146b4e38873473e280b50e27a2c Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Fri, 18 Jul 2025 16:20:51 +0530 Subject: iosys-map: Fix undefined behavior in iosys_map_clear() The current iosys_map_clear() implementation reads the potentially uninitialized 'is_iomem' boolean field to decide which union member to clear. This causes undefined behavior when called on uninitialized structures, as 'is_iomem' may contain garbage values like 0xFF. UBSAN detects this as: UBSAN: invalid-load in include/linux/iosys-map.h:267 load of value 255 is not a valid value for type '_Bool' Fix by unconditionally clearing the entire structure with memset(), eliminating the need to read uninitialized data and ensuring all fields are set to known good values. Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14639 Fixes: 01fd30da0474 ("dma-buf: Add struct dma-buf-map for storing struct dma_buf.vaddr_ptr") Signed-off-by: Nitin Gote Reviewed-by: Andi Shyti Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250718105051.2709487-1-nitin.r.gote@intel.com --- include/linux/iosys-map.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h index 4696abfd311c..3e85afe794c0 100644 --- a/include/linux/iosys-map.h +++ b/include/linux/iosys-map.h @@ -264,12 +264,7 @@ static inline bool iosys_map_is_set(const struct iosys_map *map) */ static inline void iosys_map_clear(struct iosys_map *map) { - if (map->is_iomem) { - map->vaddr_iomem = NULL; - map->is_iomem = false; - } else { - map->vaddr = NULL; - } + memset(map, 0, sizeof(*map)); } /** -- cgit v1.2.3 From 9528d32873b38281ae105f2f5799e79ae9d086c2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Aug 2025 10:27:45 +0200 Subject: kcov, usb: Don't disable interrupts in kcov_remote_start_usb_softirq() kcov_remote_start_usb_softirq() the begin of urb's completion callback. HCDs marked HCD_BH will invoke this function from the softirq and in_serving_softirq() will detect this properly. Root-HUB (RH) requests will not be delayed to softirq but complete immediately in IRQ context. This will confuse kcov because in_serving_softirq() will report true if the softirq is served after the hardirq and if the softirq got interrupted by the hardirq in which currently runs. This was addressed by simply disabling interrupts in kcov_remote_start_usb_softirq() which avoided the interruption by the RH while a regular completion callback was invoked. This not only changes the behaviour while kconv is enabled but also breaks PREEMPT_RT because now sleeping locks can no longer be acquired. Revert the previous fix. Address the issue by invoking kcov_remote_start_usb() only if the context is just "serving softirqs" which is identified by checking in_serving_softirq() and in_hardirq() must be false. Fixes: f85d39dd7ed89 ("kcov, usb: disable interrupts in kcov_remote_start_usb_softirq") Cc: stable Reported-by: Yunseong Kim Closes: https://lore.kernel.org/all/20250725201400.1078395-2-ysk@kzalloc.com/ Tested-by: Yunseong Kim Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250811082745.ycJqBXMs@linutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 12 +++++------- include/linux/kcov.h | 47 +++++++++-------------------------------------- 2 files changed, 14 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index c4a1875b5d3d..6270fbb5c699 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1636,7 +1636,6 @@ static void __usb_hcd_giveback_urb(struct urb *urb) struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); struct usb_anchor *anchor = urb->anchor; int status = urb->unlinked; - unsigned long flags; urb->hcpriv = NULL; if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) && @@ -1654,14 +1653,13 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; /* - * Only collect coverage in the softirq context and disable interrupts - * to avoid scenarios with nested remote coverage collection sections - * that KCOV does not support. - * See the comment next to kcov_remote_start_usb_softirq() for details. + * This function can be called in task context inside another remote + * coverage collection section, but kcov doesn't support that kind of + * recursion yet. Only collect coverage in softirq context for now. */ - flags = kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); + kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - kcov_remote_stop_softirq(flags); + kcov_remote_stop_softirq(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index 75a2fb8b16c3..0143358874b0 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -57,47 +57,21 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * workaround for KCOV's lack of nested remote coverage sections support. - * - * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. - * - * kcov_remote_start_usb_softirq(): - * - * 1. Only collects coverage when called in the softirq context. This allows - * avoiding nested remote coverage collection sections in the task context. - * For example, USB/IP calls usb_hcd_giveback_urb() in the task context - * within an existing remote coverage collection section. Thus, KCOV should - * not attempt to start collecting coverage within the coverage collection - * section in __usb_hcd_giveback_urb() in this case. - * - * 2. Disables interrupts for the duration of the coverage collection section. - * This allows avoiding nested remote coverage collection sections in the - * softirq context (a softirq might occur during the execution of a work in - * the BH workqueue, which runs with in_serving_softirq() > 0). - * For example, usb_giveback_urb_bh() runs in the BH workqueue with - * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in - * the middle of its remote coverage collection section, and the interrupt - * handler might invoke __usb_hcd_giveback_urb() again. + * work around for kcov's lack of nested remote coverage sections support in + * task context. Adding support for nested sections is tracked in: + * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ -static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +static inline void kcov_remote_start_usb_softirq(u64 id) { - unsigned long flags = 0; - - if (in_serving_softirq()) { - local_irq_save(flags); + if (in_serving_softirq() && !in_hardirq()) kcov_remote_start_usb(id); - } - - return flags; } -static inline void kcov_remote_stop_softirq(unsigned long flags) +static inline void kcov_remote_stop_softirq(void) { - if (in_serving_softirq()) { + if (in_serving_softirq() && !in_hardirq()) kcov_remote_stop(); - local_irq_restore(flags); - } } #ifdef CONFIG_64BIT @@ -131,11 +105,8 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline unsigned long kcov_remote_start_usb_softirq(u64 id) -{ - return 0; -} -static inline void kcov_remote_stop_softirq(unsigned long flags) {} +static inline void kcov_remote_start_usb_softirq(u64 id) {} +static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ -- cgit v1.2.3 From 8ea815399c3fcce1889bd951fec25b5b9a3979c1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 14 Apr 2025 16:41:07 +0200 Subject: compiler: remove __ADDRESSABLE_ASM{_STR,}() again __ADDRESSABLE_ASM_STR() is where the necessary stringification happens. As long as "sym" doesn't contain any odd characters, no quoting is required for its use with .quad / .long. In fact the quotation gets in the way with gas 2.25; it's only from 2.26 onwards that quoted symbols are half-way properly supported. However, assembly being different from C anyway, drop __ADDRESSABLE_ASM_STR() and its helper macro altogether. A simple .global directive will suffice to get the symbol "declared", i.e. into the symbol table. While there also stop open-coding STATIC_CALL_TRAMP() and STATIC_CALL_KEY(). Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Jan Beulich Acked-by: Josh Poimboeuf Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross Message-ID: <609d2c74-de13-4fae-ab1a-1ec44afb948d@suse.com> --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- include/linux/compiler.h | 8 -------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 59a62c3780a2..a16d4631547c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -94,12 +94,13 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); #ifdef MODULE #define __ADDRESSABLE_xen_hypercall #else -#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall) +#define __ADDRESSABLE_xen_hypercall \ + __stringify(.global STATIC_CALL_KEY(xen_hypercall);) #endif #define __HYPERCALL \ __ADDRESSABLE_xen_hypercall \ - "call __SCT__xen_hypercall" + __stringify(call STATIC_CALL_TRAMP(xen_hypercall)) #define __HYPERCALL_ENTRY(x) "a" (x) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6f04a1d8c720..64ff73c533e5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -288,14 +288,6 @@ static inline void *offset_to_ptr(const int *off) #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -#define __ADDRESSABLE_ASM(sym) \ - .pushsection .discard.addressable,"aw"; \ - .align ARCH_SEL(8,4); \ - ARCH_SEL(.quad, .long) __stringify(sym); \ - .popsection; - -#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) - /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. -- cgit v1.2.3 From 76d2e3890fb169168c73f2e4f8375c7cc24a765e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 16 Aug 2025 07:25:20 -0700 Subject: NFS: Fix a race when updating an existing write After nfs_lock_and_join_requests() tests for whether the request is still attached to the mapping, nothing prevents a call to nfs_inode_remove_request() from succeeding until we actually lock the page group. The reason is that whoever called nfs_inode_remove_request() doesn't necessarily have a lock on the page group head. So in order to avoid races, let's take the page group lock earlier in nfs_lock_and_join_requests(), and hold it across the removal of the request in nfs_inode_remove_request(). Reported-by: Jeff Layton Tested-by: Joe Quanaim Tested-by: Andrew Steffen Reviewed-by: Jeff Layton Fixes: bd37d6fce184 ("NFSv4: Convert nfs_lock_and_join_requests() to use nfs_page_find_head_request()") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 9 +++++---- fs/nfs/write.c | 29 ++++++++++------------------- include/linux/nfs_page.h | 1 + 3 files changed, 16 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 11968dcb7243..6e69ce43a13f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req) nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index fa5c41d0989a..8b7c04737967 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) } } -static int -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - int ret; - - if (!test_bit(PG_REMOVE, &req->wb_flags)) - return 0; - ret = nfs_page_group_lock(req); - if (ret) - return ret; if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) nfs_page_set_inode_ref(req, inode); - nfs_page_group_unlock(req); - return 0; } /** @@ -585,19 +575,18 @@ retry: } } + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + /* Ensure that nobody removed the request before we locked it */ if (head != folio->private) { + nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); goto retry; } - ret = nfs_cancel_remove_inode(head, inode); - if (ret < 0) - goto out_unlock; - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto out_unlock; + nfs_cancel_remove_inode(head, inode); /* lock each request in the page group */ for (subreq = head->wb_this_page; @@ -786,7 +775,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio->mapping; @@ -798,6 +788,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) } spin_unlock(&mapping->i_private_lock); } + nfs_page_group_unlock(req); if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { atomic_long_dec(&nfsi->nrequests); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 169b4ae30ff4..9aed39abc94b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -160,6 +160,7 @@ extern void nfs_join_page_group(struct nfs_page *head, extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int); extern int nfs_page_set_headlock(struct nfs_page *req); extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); -- cgit v1.2.3 From 808471ddb0fa785559c3e7aee59be20a13b46ef5 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 13 Aug 2025 15:04:55 +0900 Subject: iov_iter: iterate_folioq: fix handling of offset >= folio size It's apparently possible to get an iov advanced all the way up to the end of the current page we're looking at, e.g. (gdb) p *iter $24 = {iter_type = 4 '\004', nofault = false, data_source = false, iov_offset = 4096, {__ubuf_iovec = { iov_base = 0xffff88800f5bc000, iov_len = 655}, {{__iov = 0xffff88800f5bc000, kvec = 0xffff88800f5bc000, bvec = 0xffff88800f5bc000, folioq = 0xffff88800f5bc000, xarray = 0xffff88800f5bc000, ubuf = 0xffff88800f5bc000}, count = 655}}, {nr_segs = 2, folioq_slot = 2 '\002', xarray_start = 2}} Where iov_offset is 4k with 4k-sized folios This should have been fine because we're only in the 2nd slot and there's another one after this, but iterate_folioq should not try to map a folio that skips the whole size, and more importantly part here does not end up zero (because 'PAGE_SIZE - skip % PAGE_SIZE' ends up PAGE_SIZE and not zero..), so skip forward to the "advance to next folio" code Link: https://lkml.kernel.org/r/20250813-iot_iter_folio-v3-0-a0ffad2b665a@codewreck.org Link: https://lkml.kernel.org/r/20250813-iot_iter_folio-v3-1-a0ffad2b665a@codewreck.org Signed-off-by: Dominique Martinet Fixes: db0aa2e9566f ("mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios") Reported-by: Maximilian Bosch Reported-by: Ryan Lahfa Reported-by: Christian Theune Reported-by: Arnout Engelen Link: https://lkml.kernel.org/r/D4LHHUNLG79Y.12PI0X6BEHRHW@mbosch.me/ Acked-by: David Howells Cc: Al Viro Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Cc: [6.12+] Signed-off-by: Andrew Morton --- include/linux/iov_iter.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index c4aa58032faf..f9a17fbbd398 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -160,7 +160,7 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2 do { struct folio *folio = folioq_folio(folioq, slot); - size_t part, remain, consumed; + size_t part, remain = 0, consumed; size_t fsize; void *base; @@ -168,14 +168,16 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2 break; fsize = folioq_folio_size(folioq, slot); - base = kmap_local_folio(folio, skip); - part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); - remain = step(base, progress, part, priv, priv2); - kunmap_local(base); - consumed = part - remain; - len -= consumed; - progress += consumed; - skip += consumed; + if (skip < fsize) { + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + } if (skip >= fsize) { skip = 0; slot++; -- cgit v1.2.3 From 053c8ebe74f7e1f4c072e59428da80b9d78bc4b7 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 17 Aug 2025 23:17:59 +0800 Subject: mm/migrate: fix NULL movable_ops if CONFIG_ZSMALLOC=m After commit 84caf98838a3e5f4bdb34 ("mm: stop storing migration_ops in page->mapping") we get such an error message if CONFIG_ZSMALLOC=m: WARNING: CPU: 3 PID: 42 at mm/migrate.c:142 isolate_movable_ops_page+0xa8/0x1c0 CPU: 3 UID: 0 PID: 42 Comm: kcompactd0 Not tainted 6.16.0-rc5+ #2133 PREEMPT pc 9000000000540bd8 ra 9000000000540b84 tp 9000000100420000 sp 9000000100423a60 a0 9000000100193a80 a1 000000000000000c a2 000000000000001b a3 ffffffffffffffff a4 ffffffffffffffff a5 0000000000000267 a6 0000000000000000 a7 9000000100423ae0 t0 00000000000000f1 t1 00000000000000f6 t2 0000000000000000 t3 0000000000000001 t4 ffffff00010eb834 t5 0000000000000040 t6 900000010c89d380 t7 90000000023fcc70 t8 0000000000000018 u0 0000000000000000 s9 ffffff00010eb800 s0 ffffff00010eb800 s1 000000000000000c s2 0000000000043ae0 s3 0000800000000000 s4 900000000219cc40 s5 0000000000000000 s6 ffffff00010eb800 s7 0000000000000001 s8 90000000025b4000 ra: 9000000000540b84 isolate_movable_ops_page+0x54/0x1c0 ERA: 9000000000540bd8 isolate_movable_ops_page+0xa8/0x1c0 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000004 (PPLV0 +PIE -PWE) EUEN: 00000000 (-FPE -SXE -ASXE -BTE) ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7) ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) CPU: 3 UID: 0 PID: 42 Comm: kcompactd0 Not tainted 6.16.0-rc5+ #2133 PREEMPT Stack : 90000000021fd000 0000000000000000 9000000000247720 9000000100420000 90000001004236a0 90000001004236a8 0000000000000000 90000001004237e8 90000001004237e0 90000001004237e0 9000000100423550 0000000000000001 0000000000000001 90000001004236a8 725a84864a19e2d9 90000000023fcc58 9000000100420000 90000000024c6848 9000000002416848 0000000000000001 0000000000000000 000000000000000a 0000000007fe0000 ffffff00010eb800 0000000000000000 90000000021fd000 0000000000000000 900000000205cf30 000000000000008e 0000000000000009 ffffff00010eb800 0000000000000001 90000000025b4000 0000000000000000 900000000024773c 00007ffff103d748 00000000000000b0 0000000000000004 0000000000000000 0000000000071c1d ... Call Trace: [<900000000024773c>] show_stack+0x5c/0x190 [<90000000002415e0>] dump_stack_lvl+0x70/0x9c [<90000000004abe6c>] isolate_migratepages_block+0x3bc/0x16e0 [<90000000004af408>] compact_zone+0x558/0x1000 [<90000000004b0068>] compact_node+0xa8/0x1e0 [<90000000004b0aa4>] kcompactd+0x394/0x410 [<90000000002b3c98>] kthread+0x128/0x140 [<9000000001779148>] ret_from_kernel_thread+0x28/0xc0 [<9000000000245528>] ret_from_kernel_thread_asm+0x10/0x88 The reason is that defined(CONFIG_ZSMALLOC) evaluates to 1 only when CONFIG_ZSMALLOC=y, we should use IS_ENABLED(CONFIG_ZSMALLOC) instead. But when I use IS_ENABLED(CONFIG_ZSMALLOC), page_movable_ops() cannot access zsmalloc_mops because zsmalloc_mops is in a module. To solve this problem, we define a set_movable_ops() interface to register and unregister offline_movable_ops / zsmalloc_movable_ops in mm/migrate.c, and call them at mm/balloon_compaction.c & mm/zsmalloc.c. Since offline_movable_ops / zsmalloc_movable_ops are always accessible, all #ifdef / #endif are removed in page_movable_ops(). Link: https://lkml.kernel.org/r/20250817151759.2525174-1-chenhuacai@loongson.cn Fixes: 84caf98838a3 ("mm: stop storing migration_ops in page->mapping") Signed-off-by: Huacai Chen Acked-by: Zi Yan Acked-by: David Hildenbrand Cc: Huacai Chen Cc: Huacai Chen Cc: Lorenzo Stoakes Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- include/linux/migrate.h | 5 +++++ mm/balloon_compaction.c | 6 ++++++ mm/migrate.c | 38 ++++++++++++++++++++++++++++++-------- mm/zsmalloc.c | 10 ++++++++++ 4 files changed, 51 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index acadd41e0b5c..9009e27b5f44 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -79,6 +79,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); +int set_movable_ops(const struct movable_operations *ops, enum pagetype type); #else @@ -100,6 +101,10 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } +static inline int set_movable_ops(const struct movable_operations *ops, enum pagetype type) +{ + return -ENOSYS; +} #endif /* CONFIG_MIGRATION */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 2a4a649805c1..03c5dbabb156 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -254,4 +254,10 @@ const struct movable_operations balloon_mops = { .putback_page = balloon_page_putback, }; +static int __init balloon_init(void) +{ + return set_movable_ops(&balloon_mops, PGTY_offline); +} +core_initcall(balloon_init); + #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/migrate.c b/mm/migrate.c index 425401b2d4e1..9e5ef39ce73a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -43,8 +43,6 @@ #include #include #include -#include -#include #include @@ -53,6 +51,33 @@ #include "internal.h" #include "swap.h" +static const struct movable_operations *offline_movable_ops; +static const struct movable_operations *zsmalloc_movable_ops; + +int set_movable_ops(const struct movable_operations *ops, enum pagetype type) +{ + /* + * We only allow for selected types and don't handle concurrent + * registration attempts yet. + */ + switch (type) { + case PGTY_offline: + if (offline_movable_ops && ops) + return -EBUSY; + offline_movable_ops = ops; + break; + case PGTY_zsmalloc: + if (zsmalloc_movable_ops && ops) + return -EBUSY; + zsmalloc_movable_ops = ops; + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL_GPL(set_movable_ops); + static const struct movable_operations *page_movable_ops(struct page *page) { VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); @@ -62,15 +87,12 @@ static const struct movable_operations *page_movable_ops(struct page *page) * it as movable, the page type must be sticky until the page gets freed * back to the buddy. */ -#ifdef CONFIG_BALLOON_COMPACTION if (PageOffline(page)) /* Only balloon compaction sets PageOffline pages movable. */ - return &balloon_mops; -#endif /* CONFIG_BALLOON_COMPACTION */ -#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) + return offline_movable_ops; if (PageZsmalloc(page)) - return &zsmalloc_mops; -#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */ + return zsmalloc_movable_ops; + return NULL; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2c5e56a65354..805a10b41266 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2246,8 +2246,15 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { + int rc __maybe_unused; + #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); +#endif +#ifdef CONFIG_COMPACTION + rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); + if (rc) + return rc; #endif zs_stat_init(); return 0; @@ -2257,6 +2264,9 @@ static void __exit zs_exit(void) { #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); +#endif +#ifdef CONFIG_COMPACTION + set_movable_ops(NULL, PGTY_zsmalloc); #endif zs_stat_exit(); } -- cgit v1.2.3 From 8ef7f3132e4005a103b382e71abea7ad01fbeb86 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 20 Aug 2025 22:23:44 +0800 Subject: LoongArch: Add cpuhotplug hooks to fix high cpu usage of vCPU threads When the CPU is offline, the timer of LoongArch is not correctly closed. This is harmless for real machines, but resulting in an excessively high cpu usage rate of the offline vCPU thread in the virtual machines. To correctly close the timer, we have made the following modifications: Register the cpu hotplug event (CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING) for LoongArch. This event's hooks will be called to close the timer when the CPU is offline. Clear the timer interrupt when the timer is turned off. Since before the timer is turned off, there may be a timer interrupt that has already been in the pending state due to the interruption of the disabled, which also affects the halt state of the offline vCPU. Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/time.c | 22 ++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index 367906b10f81..f3092f2de8b5 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -5,6 +5,7 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include +#include #include #include #include @@ -102,6 +103,23 @@ static int constant_timer_next_event(unsigned long delta, struct clock_event_dev return 0; } +static int arch_timer_starting(unsigned int cpu) +{ + set_csr_ecfg(ECFGF_TIMER); + + return 0; +} + +static int arch_timer_dying(unsigned int cpu) +{ + constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); + + /* Clear Timer Interrupt */ + write_csr_tintclear(CSR_TINTCLR_TI); + + return 0; +} + static unsigned long get_loops_per_jiffy(void) { unsigned long lpj = (unsigned long)const_clock_freq; @@ -172,6 +190,10 @@ int constant_clockevent_init(void) lpj_fine = get_loops_per_jiffy(); pr_info("Constant clock event device register\n"); + cpuhp_setup_state(CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING, + "clockevents/loongarch/timer:starting", + arch_timer_starting, arch_timer_dying); + return 0; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index edfa61d80702..62cd7b35a29c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -168,6 +168,7 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, + CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_REALTEK_TIMER_STARTING, -- cgit v1.2.3 From 370ac285f23aecae40600851fb4a1a9e75e50973 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 14 Aug 2025 13:54:59 +0530 Subject: block: avoid cpu_hotplug_lock depedency on freeze_lock A recent lockdep[1] splat observed while running blktest block/005 reveals a potential deadlock caused by the cpu_hotplug_lock dependency on ->freeze_lock. This dependency was introduced by commit 033b667a823e ("block: blk-rq-qos: guard rq-qos helpers by static key"). That change added a static key to avoid fetching q->rq_qos when neither blk-wbt nor blk-iolatency is configured. The static key dynamically patches kernel text to a NOP when disabled, eliminating overhead of fetching q->rq_qos in the I/O hot path. However, enabling a static key at runtime requires acquiring both cpu_hotplug_lock and jump_label_mutex. When this happens after the queue has already been frozen (i.e., while holding ->freeze_lock), it creates a locking dependency from cpu_hotplug_lock to ->freeze_lock, which leads to a potential deadlock reported by lockdep [1]. To resolve this, replace the static key mechanism with q->queue_flags: QUEUE_FLAG_QOS_ENABLED. This flag is evaluated in the fast path before accessing q->rq_qos. If the flag is set, we proceed to fetch q->rq_qos; otherwise, the access is skipped. Since q->queue_flags is commonly accessed in IO hotpath and resides in the first cacheline of struct request_queue, checking it imposes minimal overhead while eliminating the deadlock risk. This change avoids the lockdep splat without introducing performance regressions. [1] https://lore.kernel.org/linux-block/4fdm37so3o4xricdgfosgmohn63aa7wj3ua4e5vpihoamwg3ui@fq42f5q5t5ic/ Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-block/4fdm37so3o4xricdgfosgmohn63aa7wj3ua4e5vpihoamwg3ui@fq42f5q5t5ic/ Fixes: 033b667a823e ("block: blk-rq-qos: guard rq-qos helpers by static key") Tested-by: Shin'ichiro Kawasaki Signed-off-by: Nilay Shroff Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250814082612.500845-4-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + block/blk-rq-qos.c | 9 ++++----- block/blk-rq-qos.h | 54 +++++++++++++++++++++++++++++--------------------- include/linux/blkdev.h | 1 + 4 files changed, 37 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7ed3e71f2fc0..32c65efdda46 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -95,6 +95,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), + QUEUE_FLAG_NAME(QOS_ENABLED), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index b1e24bb85ad2..654478dfbc20 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -2,8 +2,6 @@ #include "blk-rq-qos.h" -__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); - /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. @@ -319,8 +317,8 @@ void rq_qos_exit(struct request_queue *q) struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); - static_branch_dec(&block_rq_qos); } + blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); mutex_unlock(&q->rq_qos_mutex); } @@ -346,7 +344,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; - static_branch_inc(&block_rq_qos); + blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); @@ -374,10 +372,11 @@ void rq_qos_del(struct rq_qos *rqos) for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; - static_branch_dec(&block_rq_qos); break; } } + if (!q->rq_qos) + blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); mutex_lock(&q->debugfs_mutex); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 28125fc49eff..1fe22000a379 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -12,7 +12,6 @@ #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; -extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, @@ -113,49 +112,55 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && - !blk_rq_is_passthrough(rq)) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && - bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || - bio_flagged(bio, BIO_QOS_MERGED))) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - - /* - * If a bio has BIO_QOS_xxx set, it implicitly implies that - * q->rq_qos is present. So, we skip re-checking q->rq_qos - * here as an extra optimization and directly call - * __rq_qos_done_bio(). - */ - __rq_qos_done_bio(q->rq_qos, bio); - } + struct request_queue *q; + + if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) && + !bio_flagged(bio, BIO_QOS_MERGED))) + return; + + q = bdev_get_queue(bio->bi_bdev); + + /* + * If a bio has BIO_QOS_xxx set, it implicitly implies that + * q->rq_qos is present. So, we skip re-checking q->rq_qos + * here as an extra optimization and directly call + * __rq_qos_done_bio(). + */ + __rq_qos_done_bio(q->rq_qos, bio); } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -164,14 +169,16 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -179,7 +186,8 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95886b404b16..fe1797bbec42 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -656,6 +656,7 @@ enum { QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ + QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_MAX }; -- cgit v1.2.3