diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-12 12:13:01 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-12 12:13:01 -0800 |
| commit | 136114e0abf03005e182d75761ab694648e6d388 (patch) | |
| tree | 05c61b103fc9cb72a7cae99680a4b524347e9616 /include/linux | |
| parent | 4cff5c05e076d2ee4e34122aa956b84a2eaac587 (diff) | |
| parent | 0dddf20b4fd4afd59767acc144ad4da60259f21f (diff) | |
Merge tag 'mm-nonmm-stable-2026-02-12-10-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull non-MM updates from Andrew Morton:
- "ocfs2: give ocfs2 the ability to reclaim suballocator free bg" saves
disk space by teaching ocfs2 to reclaim suballocator block group
space (Heming Zhao)
- "Add ARRAY_END(), and use it to fix off-by-one bugs" adds the
ARRAY_END() macro and uses it in various places (Alejandro Colomar)
- "vmcoreinfo: support VMCOREINFO_BYTES larger than PAGE_SIZE" makes
the vmcore code future-safe, if VMCOREINFO_BYTES ever exceeds the
page size (Pnina Feder)
- "kallsyms: Prevent invalid access when showing module buildid" cleans
up kallsyms code related to module buildid and fixes an invalid
access crash when printing backtraces (Petr Mladek)
- "Address page fault in ima_restore_measurement_list()" fixes a
kexec-related crash that can occur when booting the second-stage
kernel on x86 (Harshit Mogalapalli)
- "kho: ABI headers and Documentation updates" updates the kexec
handover ABI documentation (Mike Rapoport)
- "Align atomic storage" adds the __aligned attribute to atomic_t and
atomic64_t definitions to get natural alignment of both types on
csky, m68k, microblaze, nios2, openrisc and sh (Finn Thain)
- "kho: clean up page initialization logic" simplifies the page
initialization logic in kho_restore_page() (Pratyush Yadav)
- "Unload linux/kernel.h" moves several things out of kernel.h and into
more appropriate places (Yury Norov)
- "don't abuse task_struct.group_leader" removes the usage of
->group_leader when it is "obviously unnecessary" (Oleg Nesterov)
- "list private v2 & luo flb" adds some infrastructure improvements to
the live update orchestrator (Pasha Tatashin)
* tag 'mm-nonmm-stable-2026-02-12-10-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (107 commits)
watchdog/hardlockup: simplify perf event probe and remove per-cpu dependency
procfs: fix missing RCU protection when reading real_parent in do_task_stat()
watchdog/softlockup: fix sample ring index wrap in need_counting_irqs()
kcsan, compiler_types: avoid duplicate type issues in BPF Type Format
kho: fix doc for kho_restore_pages()
tests/liveupdate: add in-kernel liveupdate test
liveupdate: luo_flb: introduce File-Lifecycle-Bound global state
liveupdate: luo_file: Use private list
list: add kunit test for private list primitives
list: add primitives for private list manipulations
delayacct: fix uapi timespec64 definition
panic: add panic_force_cpu= parameter to redirect panic to a specific CPU
netclassid: use thread_group_leader(p) in update_classid_task()
RDMA/umem: don't abuse current->group_leader
drm/pan*: don't abuse current->group_leader
drm/amd: kill the outdated "Only the pthreads threading model is supported" checks
drm/amdgpu: don't abuse current->group_leader
android/binder: use same_thread_group(proc->tsk, current) in binder_mmap()
android/binder: don't abuse current->group_leader
kho: skip memoryless NUMA nodes when reserving scratch areas
...
Diffstat (limited to 'include/linux')
28 files changed, 1051 insertions, 282 deletions
diff --git a/include/linux/array_size.h b/include/linux/array_size.h index 06d7d83196ca..0c4fec98822e 100644 --- a/include/linux/array_size.h +++ b/include/linux/array_size.h @@ -10,4 +10,10 @@ */ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +/** + * ARRAY_END - get a pointer to one past the last element in array @arr + * @arr: array + */ +#define ARRAY_END(arr) (&(arr)[ARRAY_SIZE(arr)]) + #endif /* _LINUX_ARRAY_SIZE_H */ diff --git a/include/linux/capability.h b/include/linux/capability.h index 1fb08922552c..37db92b3d6f8 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -203,6 +203,12 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) ns_capable(ns, CAP_SYS_ADMIN); } +static inline bool checkpoint_restore_ns_capable_noaudit(struct user_namespace *ns) +{ + return ns_capable_noaudit(ns, CAP_CHECKPOINT_RESTORE) || + ns_capable_noaudit(ns, CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 7edf1a07b535..e1123dd28486 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -153,4 +153,4 @@ * Bindgen uses LLVM even if our C compiler is GCC, so we cannot * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. 
*/ -#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ > 19 || (__clang_major__ == 19 && __clang_minor__ > 0)) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3c936b129860..b1b141394d13 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -289,6 +289,22 @@ struct ftrace_likely_data { # define __no_kasan_or_inline __always_inline #endif +#ifdef CONFIG_KCSAN +/* + * Type qualifier to mark variables where all data-racy accesses should be + * ignored by KCSAN. Note, the implementation simply marks these variables as + * volatile, since KCSAN will treat such accesses as "marked". + * + * Defined here because defining __data_racy as volatile for KCSAN objects only + * causes problems in BPF Type Format (BTF) generation since struct members + * of core kernel data structs will be volatile in some objects and not in + * others. Instead define it globally for KCSAN kernels. + */ +# define __data_racy volatile +#else +# define __data_racy +#endif + #ifdef __SANITIZE_THREAD__ /* * Clang still emits instrumentation for __tsan_func_{entry,exit}() and builtin @@ -300,16 +316,9 @@ struct ftrace_likely_data { * disable all instrumentation. See Kconfig.kcsan where this is mandatory. */ # define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation -/* - * Type qualifier to mark variables where all data-racy accesses should be - * ignored by KCSAN. Note, the implementation simply marks these variables as - * volatile, since KCSAN will treat such accesses as "marked". 
- */ -# define __data_racy volatile # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused #else # define __no_kcsan -# define __data_racy #endif #ifdef __SANITIZE_MEMORY__ diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 800dcc360db2..ecb06f16d22c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -69,6 +69,14 @@ struct task_delay_info { u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ u32 irq_count; /* total count of IRQ/SOFTIRQ */ + + struct timespec64 blkio_delay_max_ts; + struct timespec64 swapin_delay_max_ts; + struct timespec64 freepages_delay_max_ts; + struct timespec64 thrashing_delay_max_ts; + struct timespec64 compact_delay_max_ts; + struct timespec64 wpcopy_delay_max_ts; + struct timespec64 irq_delay_max_ts; }; #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 4e1cb4f91f49..44d7ae95ddbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1376,24 +1376,13 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -int __bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym); +int bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); -static inline int -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - int ret = __bpf_address_lookup(addr, size, off, sym); - - if (ret && modname) - *modname = NULL; - return ret; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); @@ -1432,8 +1421,8 @@ static inline bool bpf_jit_kallsyms_enabled(void) } static inline int -__bpf_address_lookup(unsigned long addr, unsigned long 
*size, - unsigned long *off, char *sym) +bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) { return 0; } @@ -1454,13 +1443,6 @@ static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) return NULL; } -static inline int -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - return 0; -} - static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 705db0a6d995..1a4d36fc9085 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -88,11 +88,13 @@ struct ftrace_func_entry; defined(CONFIG_DYNAMIC_FTRACE) int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym); + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym); #else static inline int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { return 0; } diff --git a/include/linux/ima.h b/include/linux/ima.h index 8e29cb4e6a01..abf8923f8fc5 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -69,6 +69,7 @@ static inline int ima_measure_critical_data(const char *event_label, #ifdef CONFIG_HAVE_IMA_KEXEC int __init ima_free_kexec_buffer(void); int __init ima_get_kexec_buffer(void **addr, size_t *size); +int ima_validate_range(phys_addr_t phys, size_t size); #endif #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 711a1f0d1a73..a1b4cf81adc2 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -7,6 +7,7 @@ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H +#include <linux/bug.h> #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> 
@@ -55,6 +56,19 @@ static __always_inline void instrument_read_write(const volatile void *v, size_t kcsan_check_read_write(v, size); } +static __always_inline void instrument_atomic_check_alignment(const volatile void *v, size_t size) +{ +#ifndef __DISABLE_EXPORTS + if (IS_ENABLED(CONFIG_DEBUG_ATOMIC)) { + unsigned int mask = size - 1; + + if (IS_ENABLED(CONFIG_DEBUG_ATOMIC_LARGEST_ALIGN)) + mask &= sizeof(struct { long x; } __aligned_largest) - 1; + WARN_ON_ONCE((unsigned long)v & mask); + } +#endif +} + /** * instrument_atomic_read - instrument atomic read access * @v: address of access @@ -67,6 +81,7 @@ static __always_inline void instrument_atomic_read(const volatile void *v, size_ { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); + instrument_atomic_check_alignment(v, size); } /** @@ -81,6 +96,7 @@ static __always_inline void instrument_atomic_write(const volatile void *v, size { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); + instrument_atomic_check_alignment(v, size); } /** @@ -95,6 +111,7 @@ static __always_inline void instrument_atomic_read_write(const volatile void *v, { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); + instrument_atomic_check_alignment(v, size); } /** diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 60ca6a49839c..3e0f4c990297 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -10,6 +10,7 @@ #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ +#include <linux/args.h> #include <linux/bits.h> #include <linux/compiler.h> #include <linux/minmax.h> @@ -165,8 +166,12 @@ enum { #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) -#define DEFINE_RES(_start, _size, _flags) \ +#define __DEFINE_RES0() \ + DEFINE_RES_NAMED(0, 0, NULL, IORESOURCE_UNSET) +#define __DEFINE_RES3(_start, _size, _flags) \ DEFINE_RES_NAMED(_start, _size, NULL, _flags) +#define DEFINE_RES(...) 
\ + CONCATENATE(__DEFINE_RES, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b46924fdff5..e5570a16cbb1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -21,7 +21,6 @@ #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/bitops.h> -#include <linux/hex.h> #include <linux/kstrtox.h> #include <linux/log2.h> #include <linux/math.h> @@ -32,7 +31,7 @@ #include <linux/build_bug.h> #include <linux/sprintf.h> #include <linux/static_call_types.h> -#include <linux/instruction_pointer.h> +#include <linux/trace_printk.h> #include <linux/util_macros.h> #include <linux/wordpart.h> @@ -40,8 +39,6 @@ #include <uapi/linux/kernel.h> -#define STACK_MAGIC 0xdeadbeef - struct completion; struct user; @@ -192,215 +189,9 @@ enum system_states { }; extern enum system_states system_state; -/* - * General tracing related utility functions - trace_printk(), - * tracing_on/tracing_off and tracing_start()/tracing_stop - * - * Use tracing_on/tracing_off when you want to quickly turn on or off - * tracing. It simply enables or disables the recording of the trace events. - * This also corresponds to the user space /sys/kernel/tracing/tracing_on - * file, which gives a means for the kernel and userspace to interact. - * Place a tracing_off() in the kernel where you want tracing to end. - * From user space, examine the trace, and then echo 1 > tracing_on - * to continue tracing. - * - * tracing_stop/tracing_start has slightly more overhead. It is used - * by things like suspend to ram where disabling the recording of the - * trace is not enough, but tracing must actually stop because things - * like calling smp_processor_id() may crash the system. - * - * Most likely, you want to use tracing_on/tracing_off. 
- */ - -enum ftrace_dump_mode { - DUMP_NONE, - DUMP_ALL, - DUMP_ORIG, - DUMP_PARAM, -}; - -#ifdef CONFIG_TRACING -void tracing_on(void); -void tracing_off(void); -int tracing_is_on(void); -void tracing_snapshot(void); -void tracing_snapshot_alloc(void); - -extern void tracing_start(void); -extern void tracing_stop(void); - -static inline __printf(1, 2) -void ____trace_printk_check_format(const char *fmt, ...) -{ -} -#define __trace_printk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_printk_check_format(fmt, ##args); \ -} while (0) - -/** - * trace_printk - printf formatting in the ftrace buffer - * @fmt: the printf format for printing - * - * Note: __trace_printk is an internal function for trace_printk() and - * the @ip is passed in via the trace_printk() macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_printks scattered around in - * your code. (Extra memory is used for special buffers that are - * allocated when trace_printk() is used.) - * - * A little optimization trick is done here. If there's only one - * argument, there's no need to scan the string for printf formats. - * The trace_puts() will suffice. But how can we take advantage of - * using trace_puts() when trace_printk() has only one argument? - * By stringifying the args and checking the size we can tell - * whether or not there are args. __stringify((__VA_ARGS__)) will - * turn into "()\0" with a size of 3 when there are no args, anything - * else will be bigger. All we need to do is define a string to this, - * and then take its size and compare to 3. If it's bigger, use - * do_trace_printk() otherwise, optimize it to trace_puts(). Then just - * let gcc optimize the rest. 
- */ - -#define trace_printk(fmt, ...) \ -do { \ - char _______STR[] = __stringify((__VA_ARGS__)); \ - if (sizeof(_______STR) > 3) \ - do_trace_printk(fmt, ##__VA_ARGS__); \ - else \ - trace_puts(fmt); \ -} while (0) - -#define do_trace_printk(fmt, args...) \ -do { \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_printk_check_format(fmt, ##args); \ - \ - if (__builtin_constant_p(fmt)) \ - __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ - else \ - __trace_printk(_THIS_IP_, fmt, ##args); \ -} while (0) - -extern __printf(2, 3) -int __trace_bprintk(unsigned long ip, const char *fmt, ...); - -extern __printf(2, 3) -int __trace_printk(unsigned long ip, const char *fmt, ...); - -/** - * trace_puts - write a string into the ftrace buffer - * @str: the string to record - * - * Note: __trace_bputs is an internal function for trace_puts and - * the @ip is passed in via the trace_puts macro. - * - * This is similar to trace_printk() but is made for those really fast - * paths that a developer wants the least amount of "Heisenbug" effects, - * where the processing of the print format is still too much. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_puts scattered around in - * your code. (Extra memory is used for special buffers that are - * allocated when trace_puts() is used.) - * - * Returns: 0 if nothing was written, positive # if string was. - * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) - */ - -#define trace_puts(str) ({ \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(str) ? 
str : NULL; \ - \ - if (__builtin_constant_p(str)) \ - __trace_bputs(_THIS_IP_, trace_printk_fmt); \ - else \ - __trace_puts(_THIS_IP_, str, strlen(str)); \ -}) -extern int __trace_bputs(unsigned long ip, const char *str); -extern int __trace_puts(unsigned long ip, const char *str, int size); - -extern void trace_dump_stack(int skip); - -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement. - */ -#define ftrace_vprintk(fmt, vargs) \ -do { \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ - } else \ - __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ -} while (0) - -extern __printf(2, 0) int -__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); - -extern __printf(2, 0) int -__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); - -extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); -#else -static inline void tracing_start(void) { } -static inline void tracing_stop(void) { } -static inline void trace_dump_stack(int skip) { } - -static inline void tracing_on(void) { } -static inline void tracing_off(void) { } -static inline int tracing_is_on(void) { return 0; } -static inline void tracing_snapshot(void) { } -static inline void tracing_snapshot_alloc(void) { } - -static inline __printf(1, 2) -int trace_printk(const char *fmt, ...) 
-{ - return 0; -} -static __printf(1, 0) inline int -ftrace_vprintk(const char *fmt, va_list ap) -{ - return 0; -} -static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } -#endif /* CONFIG_TRACING */ - /* Rebuild everything on CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE # define REBUILD_DUE_TO_DYNAMIC_FTRACE #endif -/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ -#define VERIFY_OCTAL_PERMISSIONS(perms) \ - (BUILD_BUG_ON_ZERO((perms) < 0) + \ - BUILD_BUG_ON_ZERO((perms) > 0777) + \ - /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ - BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ - BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ - /* USER_WRITABLE >= GROUP_WRITABLE */ \ - BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ - /* OTHER_WRITABLE? Generally considered a bad idea. */ \ - BUILD_BUG_ON_ZERO((perms) & 2) + \ - (perms)) #endif diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 5f7b9de97e8d..ac4129d1d741 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -11,49 +11,26 @@ struct kho_scratch { phys_addr_t size; }; +struct kho_vmalloc; + struct folio; struct page; -#define DECLARE_KHOSER_PTR(name, type) \ - union { \ - phys_addr_t phys; \ - type ptr; \ - } name -#define KHOSER_STORE_PTR(dest, val) \ - ({ \ - typeof(val) v = val; \ - typecheck(typeof((dest).ptr), v); \ - (dest).phys = virt_to_phys(v); \ - }) -#define KHOSER_LOAD_PTR(src) \ - ({ \ - typeof(src) s = src; \ - (typeof((s).ptr))((s).phys ? 
phys_to_virt((s).phys) : NULL); \ - }) - -struct kho_vmalloc_chunk; -struct kho_vmalloc { - DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); - unsigned int total_pages; - unsigned short flags; - unsigned short order; -}; - #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); void kho_unpreserve_folio(struct folio *folio); -int kho_preserve_pages(struct page *page, unsigned int nr_pages); -void kho_unpreserve_pages(struct page *page, unsigned int nr_pages); +int kho_preserve_pages(struct page *page, unsigned long nr_pages); +void kho_unpreserve_pages(struct page *page, unsigned long nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); void *kho_alloc_preserve(size_t size); void kho_unpreserve_free(void *mem); void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); -struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); +struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(const char *name, void *fdt); void kho_remove_subtree(void *fdt); diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h new file mode 100644 index 000000000000..2201a0d2c159 --- /dev/null +++ b/include/linux/kho/abi/kexec_handover.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Google LLC, Jason Miu <jasonmiu@google.com> + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#define _LINUX_KHO_ABI_KEXEC_HANDOVER_H + +#include <linux/types.h> + +/** + * DOC: Kexec Handover ABI + * + * Kexec Handover 
uses the ABI defined below for passing preserved data from + * one kernel to the next. + * The ABI uses Flattened Device Tree (FDT) format. The first kernel creates an + * FDT which is then passed to the next kernel during a kexec handover. + * + * This interface is a contract. Any modification to the FDT structure, node + * properties, compatible string, or the layout of the data structures + * referenced here constitutes a breaking change. Such changes require + * incrementing the version number in KHO_FDT_COMPATIBLE to prevent a new kernel + * from misinterpreting data from an older kernel. Changes are allowed provided + * the compatibility version is incremented. However, backward/forward + * compatibility is only guaranteed for kernels supporting the same ABI version. + * + * FDT Structure Overview: + * The FDT serves as a central registry for physical + * addresses of preserved data structures and sub-FDTs. The first kernel + * populates this FDT with references to memory regions and other FDTs that + * need to persist across the kexec transition. The subsequent kernel then + * parses this FDT to locate and restore the preserved data.:: + * + * / { + * compatible = "kho-v1"; + * + * preserved-memory-map = <0x...>; + * + * <subnode-name-1> { + * fdt = <0x...>; + * }; + * + * <subnode-name-2> { + * fdt = <0x...>; + * }; + * ... ... + * <subnode-name-N> { + * fdt = <0x...>; + * }; + * }; + * + * Root KHO Node (/): + * - compatible: "kho-v1" + * + * Identifies the overall KHO ABI version. + * + * - preserved-memory-map: u64 + * + * Physical memory address pointing to the root of the + * preserved memory map data structure. + * + * Subnodes (<subnode-name-N>): + * Subnodes can also be added to the root node to + * describe other preserved data blobs. The <subnode-name-N> + * is provided by the subsystem that uses KHO for preserving its + * data. + * + * - fdt: u64 + * + * Physical address pointing to a subnode FDT blob that is also + * being preserved. 
+ */ + +/* The compatible string for the KHO FDT root node. */ +#define KHO_FDT_COMPATIBLE "kho-v1" + +/* The FDT property for the preserved memory map. */ +#define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" + +/* The FDT property for sub-FDTs. */ +#define KHO_FDT_SUB_TREE_PROP_NAME "fdt" + +/** + * DOC: Kexec Handover ABI for vmalloc Preservation + * + * The Kexec Handover ABI for preserving vmalloc'ed memory is defined by + * a set of structures and helper macros. The layout of these structures is a + * stable contract between kernels and is versioned by the KHO_FDT_COMPATIBLE + * string. + * + * The preservation is managed through a main descriptor &struct kho_vmalloc, + * which points to a linked list of &struct kho_vmalloc_chunk structures. These + * chunks contain the physical addresses of the preserved pages, allowing the + * next kernel to reconstruct the vmalloc area with the same content and layout. + * Helper macros are also defined for storing and loading pointers within + * these structures. + */ + +/* Helper macro to define a union for a serializable pointer. */ +#define DECLARE_KHOSER_PTR(name, type) \ + union { \ + u64 phys; \ + type ptr; \ + } name + +/* Stores the physical address of a serializable pointer. */ +#define KHOSER_STORE_PTR(dest, val) \ + ({ \ + typeof(val) v = val; \ + typecheck(typeof((dest).ptr), v); \ + (dest).phys = virt_to_phys(v); \ + }) + +/* Loads the stored physical address back to a pointer. */ +#define KHOSER_LOAD_PTR(src) \ + ({ \ + typeof(src) s = src; \ + (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ + }) + +/* + * This header is embedded at the beginning of each `kho_vmalloc_chunk` + * and contains a pointer to the next chunk in the linked list, + * stored as a physical address for handover. 
+ */ +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(u64)) + +/* + * Each chunk is a single page and is part of a linked list that describes + * a preserved vmalloc area. It contains the header with the link to the next + * chunk and a zero terminated array of physical addresses of the pages that + * make up the preserved vmalloc area. + */ +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + u64 phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* + * Describes a preserved vmalloc memory area, including the + * total number of pages, allocation flags, page order, and a pointer to the + * first chunk of physical page addresses. + */ +struct kho_vmalloc { + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); + unsigned int total_pages; + unsigned short flags; + unsigned short order; +}; + +#endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index bb099c92e469..46750a0ddf88 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -8,10 +8,10 @@ /** * DOC: Live Update Orchestrator ABI * - * This header defines the stable Application Binary Interface used by the - * Live Update Orchestrator to pass state from a pre-update kernel to a - * post-update kernel. The ABI is built upon the Kexec HandOver framework - * and uses a Flattened Device Tree to describe the preserved data. + * Live Update Orchestrator uses the stable Application Binary Interface + * defined below to pass state from a pre-update kernel to a post-update + * kernel. The ABI is built upon the Kexec HandOver framework and uses a + * Flattened Device Tree to describe the preserved data. * * This interface is a contract. 
Any modification to the FDT structure, node * properties, compatible strings, or the layout of the `__packed` serialization @@ -37,6 +37,11 @@ * compatible = "luo-session-v1"; * luo-session-header = <phys_addr_of_session_header_ser>; * }; + * + * luo-flb { + * compatible = "luo-flb-v1"; + * luo-flb-header = <phys_addr_of_flb_header_ser>; + * }; * }; * * Main LUO Node (/): @@ -56,6 +61,17 @@ * is the header for a contiguous block of memory containing an array of * `struct luo_session_ser`, one for each preserved session. * + * File-Lifecycle-Bound Node (luo-flb): + * This node describes all preserved global objects whose lifecycle is bound + * to that of the preserved files (e.g., shared IOMMU state). + * + * - compatible: "luo-flb-v1" + * Identifies the FLB ABI version. + * - luo-flb-header: u64 + * The physical address of a `struct luo_flb_header_ser`. This structure is + * the header for a contiguous block of memory containing an array of + * `struct luo_flb_ser`, one for each preserved global object. + * * Serialization Structures: * The FDT properties point to memory regions containing arrays of simple, * `__packed` structures. These structures contain the actual preserved state. @@ -74,6 +90,16 @@ * Metadata for a single preserved file. Contains the `compatible` string to * find the correct handler in the new kernel, a user-provided `token` for * identification, and an opaque `data` handle for the handler to use. + * + * - struct luo_flb_header_ser: + * Header for the FLB array. Contains the total page count of the + * preserved memory block and the number of `struct luo_flb_ser` entries + * that follow. + * + * - struct luo_flb_ser: + * Metadata for a single preserved global object. Contains its `name` + * (compatible string), an opaque `data` handle, and the `count` + * number of files depending on it. 
*/ #ifndef _LINUX_KHO_ABI_LUO_H #define _LINUX_KHO_ABI_LUO_H @@ -163,4 +189,59 @@ struct luo_session_ser { struct luo_file_set_ser file_set_ser; } __packed; +/* The max size is set so it can be reliably used during serialization */ +#define LIVEUPDATE_FLB_COMPAT_LENGTH 48 + +#define LUO_FDT_FLB_NODE_NAME "luo-flb" +#define LUO_FDT_FLB_COMPATIBLE "luo-flb-v1" +#define LUO_FDT_FLB_HEADER "luo-flb-header" + +/** + * struct luo_flb_header_ser - Header for the serialized FLB data block. + * @pgcnt: The total number of pages occupied by the entire preserved memory + * region, including this header and the subsequent array of + * &struct luo_flb_ser entries. + * @count: The number of &struct luo_flb_ser entries that follow this header + * in the memory block. + * + * This structure is located at the physical address specified by the + * `LUO_FDT_FLB_HEADER` FDT property. It provides the new kernel with the + * necessary information to find and iterate over the array of preserved + * File-Lifecycle-Bound objects and to manage the underlying memory. + * + * If this structure is modified, LUO_FDT_FLB_COMPATIBLE must be updated. + */ +struct luo_flb_header_ser { + u64 pgcnt; + u64 count; +} __packed; + +/** + * struct luo_flb_ser - Represents the serialized state of a single FLB object. + * @name: The unique compatibility string of the FLB object, used to find the + * corresponding &struct liveupdate_flb handler in the new kernel. + * @data: The opaque u64 handle returned by the FLB's .preserve() operation + * in the old kernel. This handle encapsulates the entire state needed + * for restoration. + * @count: The reference count at the time of serialization; i.e., the number + * of preserved files that depended on this FLB. This is used by the + * new kernel to correctly manage the FLB's lifecycle. + * + * An array of these structures is created in a preserved memory region and + * passed to the new kernel. Each entry allows the LUO core to restore one + * global, shared object. 
+ * + * If this structure is modified, LUO_FDT_FLB_COMPATIBLE must be updated. + */ +struct luo_flb_ser { + char name[LIVEUPDATE_FLB_COMPAT_LENGTH]; + u64 data; + u64 count; +} __packed; + +/* Kernel Live Update Test ABI */ +#ifdef CONFIG_LIVEUPDATE_TEST +#define LIVEUPDATE_TEST_FLB_COMPATIBLE(i) "liveupdate-test-flb-v" #i +#endif + #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/linux/kho/abi/memblock.h b/include/linux/kho/abi/memblock.h new file mode 100644 index 000000000000..27b042f470e1 --- /dev/null +++ b/include/linux/kho/abi/memblock.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KHO_ABI_MEMBLOCK_H +#define _LINUX_KHO_ABI_MEMBLOCK_H + +/** + * DOC: memblock kexec handover ABI + * + * Memblock can serialize its current memory reservations created with + * reserve_mem command line option across kexec through KHO. + * The post-KHO kernel can then consume these reservations and they are + * guaranteed to have the same physical address. + * + * The state is serialized using Flattened Device Tree (FDT) format. Any + * modification to the FDT structure, node properties, or the compatible + * strings constitutes a breaking change. Such changes require incrementing the + * version number in the relevant `_COMPATIBLE` string to prevent a new kernel + * from misinterpreting data from an old kernel. + * + * Changes are allowed provided the compatibility version is incremented. + * However, backward/forward compatibility is only guaranteed for kernels + * supporting the same ABI version. + * + * FDT Structure Overview: + * The entire memblock state is encapsulated within a single KHO entry named + * "memblock". + * This entry contains an FDT with the following layout: + * + * .. 
code-block:: none + * + * / { + * compatible = "memblock-v1"; + * + * n1 { + * compatible = "reserve-mem-v1"; + * start = <0xc06b 0x4000000>; + * size = <0x04 0x00>; + * }; + * }; + * + * Main memblock node (/): + * + * - compatible: "memblock-v1" + + * Identifies the overall memblock ABI version. + * + * reserved_mem node: + * These nodes describe all reserve_mem regions. The node name is the name + * defined by the user for a reserve_mem region. + * + * - compatible: "reserve-mem-v1" + * + * Identifies the ABI version of reserve_mem descriptions + * + * - start: u64 + * + * Physical address of the reserved memory region. + * + * - size: u64 + * + * size in bytes of the reserved memory region. + */ + +/* Top level memblock FDT node name. */ +#define MEMBLOCK_KHO_FDT "memblock" + +/* The compatible string for the memblock FDT root node. */ +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" + +/* The compatible string for the reserve_mem FDT nodes. */ +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" + +#endif /* _LINUX_KHO_ABI_MEMBLOCK_H */ diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index da7d063474a1..68cb6303b846 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -12,13 +12,13 @@ #define _LINUX_KHO_ABI_MEMFD_H #include <linux/types.h> -#include <linux/kexec_handover.h> +#include <linux/kho/abi/kexec_handover.h> /** * DOC: memfd Live Update ABI * - * This header defines the ABI for preserving the state of a memfd across a - * kexec reboot using the LUO. + * memfd uses the ABI defined below for preserving its state across a kexec + * reboot using the LUO. * * The state is serialized into a packed structure `struct memfd_luo_ser` * which is handed over to the next kernel via the KHO mechanism. 
diff --git a/include/linux/list_private.h b/include/linux/list_private.h new file mode 100644 index 000000000000..19b01d16beda --- /dev/null +++ b/include/linux/list_private.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ +#ifndef _LINUX_LIST_PRIVATE_H +#define _LINUX_LIST_PRIVATE_H + +/** + * DOC: Private List Primitives + * + * Provides a set of list primitives identical in function to those in + * ``<linux/list.h>``, but designed for cases where the embedded + * ``&struct list_head`` is private member. + */ + +#include <linux/compiler.h> +#include <linux/list.h> + +#define __list_private_offset(type, member) \ + ((size_t)(&ACCESS_PRIVATE(((type *)0), member))) + +/** + * list_private_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_entry(ptr, type, member) ({ \ + const struct list_head *__mptr = (ptr); \ + (type *)((char *)__mptr - __list_private_offset(type, member)); \ +}) + +/** + * list_private_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_first_entry(ptr, type, member) \ + list_private_entry((ptr)->next, type, member) + +/** + * list_private_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_last_entry(ptr, type, member) \ + list_private_entry((ptr)->prev, type, member) + +/** + * list_private_next_entry - get the next element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. 
+ */ +#define list_private_next_entry(pos, member) \ + list_private_entry(ACCESS_PRIVATE(pos, member).next, typeof(*(pos)), member) + +/** + * list_private_next_entry_circular - get the next element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the last element (return the first element). + * Note, that list is expected to be not empty. + */ +#define list_private_next_entry_circular(pos, head, member) \ + (list_is_last(&ACCESS_PRIVATE(pos, member), head) ? \ + list_private_first_entry(head, typeof(*(pos)), member) : \ + list_private_next_entry(pos, member)) + +/** + * list_private_prev_entry - get the prev element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. + */ +#define list_private_prev_entry(pos, member) \ + list_private_entry(ACCESS_PRIVATE(pos, member).prev, typeof(*(pos)), member) + +/** + * list_private_prev_entry_circular - get the prev element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the first element (return the last element). + * Note, that list is expected to be not empty. + */ +#define list_private_prev_entry_circular(pos, head, member) \ + (list_is_first(&ACCESS_PRIVATE(pos, member), head) ? \ + list_private_last_entry(head, typeof(*(pos)), member) : \ + list_private_prev_entry(pos, member)) + +/** + * list_private_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_entry_is_head(pos, head, member) \ + list_is_head(&ACCESS_PRIVATE(pos, member), (head)) + +/** + * list_private_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. 
+ * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_for_each_entry(pos, head, member) \ + for (pos = list_private_first_entry(head, typeof(*pos), member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_for_each_entry_reverse(pos, head, member) \ + for (pos = list_private_last_entry(head, typeof(*pos), member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define list_private_for_each_entry_continue(pos, head, member) \ + for (pos = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_continue_reverse - iterate backwards from the given point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Start to iterate over list of given type backwards, continuing after + * the current position. 
+ */ +#define list_private_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_private_prev_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_from - iterate over list of given type from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing from current position. + */ +#define list_private_for_each_entry_from(pos, head, member) \ + for (; !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_from_reverse - iterate backwards over list of given type + * from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, continuing from current position. + */ +#define list_private_for_each_entry_from_reverse(pos, head, member) \ + for (; !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_private_first_entry(head, typeof(*pos), member), \ + n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_continue - continue list iteration safe against removal + * @pos: the type * to use as a loop cursor. 
+ * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. + */ +#define list_private_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_private_next_entry(pos, member), \ + n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_from - iterate over list from current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_private_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_reverse - iterate backwards over list safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. 
+ */ +#define list_private_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_private_last_entry(head, typeof(*pos), member), \ + n = list_private_prev_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_prev_entry(n, member)) + +/** + * list_private_safe_reset_next - reset a stale list_for_each_entry_safe loop + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_head within the struct. + * + * list_safe_reset_next is not safe to use in general if the list may be + * modified concurrently (eg. the lock is dropped in the loop body). An + * exception to this is if the cursor element (pos) is pinned in the list, + * and list_safe_reset_next is called after re-taking the lock and before + * completing the current iteration of the loop body. + */ +#define list_private_safe_reset_next(pos, n, member) \ + n = list_private_next_entry(pos, member) + +#endif /* _LINUX_LIST_PRIVATE_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index a7f6ee5b6771..fe82a6c3005f 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -11,10 +11,13 @@ #include <linux/compiler.h> #include <linux/kho/abi/luo.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/types.h> #include <uapi/linux/liveupdate.h> struct liveupdate_file_handler; +struct liveupdate_flb; +struct liveupdate_session; struct file; /** @@ -99,6 +102,118 @@ struct liveupdate_file_handler { * registered file handlers. */ struct list_head __private list; + /* A list of FLB dependencies. */ + struct list_head __private flb_list; +}; + +/** + * struct liveupdate_flb_op_args - Arguments for FLB operation callbacks. + * @flb: The global FLB instance for which this call is performed. + * @data: For .preserve(): [OUT] The callback sets this field. + * For .unpreserve(): [IN] The handle from .preserve(). 
+ * For .retrieve(): [IN] The handle from .preserve(). + * @obj: For .preserve(): [OUT] Sets this to the live object. + * For .retrieve(): [OUT] Sets this to the live object. + * For .finish(): [IN] The live object from .retrieve(). + * + * This structure bundles all parameters for the FLB operation callbacks. + */ +struct liveupdate_flb_op_args { + struct liveupdate_flb *flb; + u64 data; + void *obj; +}; + +/** + * struct liveupdate_flb_ops - Callbacks for global File-Lifecycle-Bound data. + * @preserve: Called when the first file using this FLB is preserved. + * The callback must save its state and return a single, + * self-contained u64 handle by setting the 'argp->data' + * field and 'argp->obj'. + * @unpreserve: Called when the last file using this FLB is unpreserved + * (aborted before reboot). Receives the handle via + * 'argp->data' and live object via 'argp->obj'. + * @retrieve: Called on-demand in the new kernel, the first time a + * component requests access to the shared object. It receives + * the preserved handle via 'argp->data' and must reconstruct + * the live object, returning it by setting the 'argp->obj' + * field. + * @finish: Called in the new kernel when the last file using this FLB + * is finished. Receives the live object via 'argp->obj' for + * cleanup. + * @owner: Module reference + * + * Operations that manage global shared data with file bound lifecycle, + * triggered by the first file that uses it and concluded by the last file that + * uses it, across all sessions. + */ +struct liveupdate_flb_ops { + int (*preserve)(struct liveupdate_flb_op_args *argp); + void (*unpreserve)(struct liveupdate_flb_op_args *argp); + int (*retrieve)(struct liveupdate_flb_op_args *argp); + void (*finish)(struct liveupdate_flb_op_args *argp); + struct module *owner; +}; + +/* + * struct luo_flb_private_state - Private FLB state structures. + * @count: The number of preserved files currently depending on this FLB. 
+ * This is used to trigger the preserve/unpreserve/finish ops on the + * first/last file. + * @data: The opaque u64 handle returned by .preserve() or passed to + * .retrieve(). + * @obj: The live kernel object returned by .preserve() or .retrieve(). + * @lock: A mutex that protects all fields within this structure, providing + * the synchronization service for the FLB's ops. + * @finished: True once the FLB's finish() callback has run. + * @retrieved: True once the FLB's retrieve() callback has run. + */ +struct luo_flb_private_state { + long count; + u64 data; + void *obj; + struct mutex lock; + bool finished; + bool retrieved; +}; + +/* + * struct luo_flb_private - Keep separate incoming and outgoing states. + * @list: A global list of registered FLBs. + * @outgoing: The runtime state for the pre-reboot + * (preserve/unpreserve) lifecycle. + * @incoming: The runtime state for the post-reboot (retrieve/finish) + * lifecycle. + * @users: With how many File-Handlers this FLB is registered. + * @initialized: true when private fields have been initialized. + */ +struct luo_flb_private { + struct list_head list; + struct luo_flb_private_state outgoing; + struct luo_flb_private_state incoming; + int users; + bool initialized; +}; + +/** + * struct liveupdate_flb - A global definition for a shared data object. + * @ops: Callback functions + * @compatible: The compatibility string (e.g., "iommu-core-v1" + * that uniquely identifies the FLB type this handler + * supports. This is matched against the compatible string + * associated with individual &struct liveupdate_flb + * instances. + * + * This struct is the "template" that a driver registers to define a shared, + * file-lifecycle-bound object. The actual runtime state (the live object, + * refcount, etc.) is managed privately by the LUO core. 
+ */ +struct liveupdate_flb { + const struct liveupdate_flb_ops *ops; + const char compatible[LIVEUPDATE_FLB_COMPAT_LENGTH]; + + /* private: */ + struct luo_flb_private __private private; }; #ifdef CONFIG_LIVEUPDATE @@ -112,6 +227,14 @@ int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); +int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); + +int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); +int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); + #else /* CONFIG_LIVEUPDATE */ static inline bool liveupdate_enabled(void) @@ -134,5 +257,29 @@ static inline int liveupdate_unregister_file_handler(struct liveupdate_file_hand return -EOPNOTSUPP; } +static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, + void **objp) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, + void **objp) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ diff --git a/include/linux/log2.h b/include/linux/log2.h index 2eac3fc9303d..e17ceb32e0c9 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -44,7 +44,7 @@ int __ilog2_u64(u64 n) static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { - return (n != 0 && ((n & (n - 1)) == 0)); + return n - 1 < (n ^ (n - 1)); } /** diff --git a/include/linux/module.h b/include/linux/module.h index 20ddfd97630d..14f391b186c6 
100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -742,6 +742,15 @@ static inline void __module_get(struct module *module) __mod ? __mod->name : "kernel"; \ }) +static inline const unsigned char *module_buildid(struct module *mod) +{ +#ifdef CONFIG_STACKTRACE_BUILD_ID + return mod->build_id; +#else + return NULL; +#endif +} + /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index c03db3c2fd40..7d22d4c4ea2e 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -2,9 +2,14 @@ #ifndef _LINUX_MODULE_PARAMS_H #define _LINUX_MODULE_PARAMS_H /* (C) Copyright 2001, 2002 Rusty Russell IBM Corporation */ + +#include <linux/array_size.h> +#include <linux/build_bug.h> +#include <linux/compiler.h> #include <linux/init.h> #include <linux/stringify.h> -#include <linux/kernel.h> +#include <linux/sysfs.h> +#include <linux/types.h> /* * The maximum module name length, including the NUL byte. diff --git a/include/linux/panic.h b/include/linux/panic.h index a00bc0937698..f1dd417e54b2 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -41,6 +41,14 @@ void abort(void); * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). */ extern atomic_t panic_cpu; + +/* + * panic_redirect_cpu is used when panic is redirected to a specific CPU via + * the panic_force_cpu= boot parameter. It holds the CPU number that originally + * triggered the panic before redirection. A value of PANIC_CPU_INVALID means + * no redirection has occurred. 
+ */ +extern atomic_t panic_redirect_cpu; #define PANIC_CPU_INVALID -1 bool panic_try_start(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 873e400aafce..074ad4ef3d81 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,7 @@ #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> +#include <linux/time64.h> #ifndef COMPILE_OFFSETS #include <generated/rq-offsets.h> #endif @@ -86,6 +87,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; +struct timespec64; struct user_event_mm; #include <linux/sched/ext.h> @@ -435,6 +437,9 @@ struct sched_info { /* When were we last queued to run? */ unsigned long long last_queued; + /* Timestamp of max time spent waiting on a runqueue: */ + struct timespec64 max_run_delay_ts; + #endif /* CONFIG_SCHED_INFO */ }; diff --git a/include/linux/smp.h b/include/linux/smp.h index 91d0ecf3b8d3..1ebd88026119 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -62,6 +62,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd); void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); +int panic_smp_redirect_cpu(int target_cpu, void *msg); /* * Call a function on all processors diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index c33a96b7391a..99b775f3ff46 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -808,4 +808,17 @@ static inline void sysfs_put(struct kernfs_node *kn) kernfs_put(kn); } +/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? 
*/ +#define VERIFY_OCTAL_PERMISSIONS(perms) \ + (BUILD_BUG_ON_ZERO((perms) < 0) + \ + BUILD_BUG_ON_ZERO((perms) > 0777) + \ + /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ + BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ + /* USER_WRITABLE >= GROUP_WRITABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ + /* OTHER_WRITABLE? Generally considered a bad idea. */ \ + BUILD_BUG_ON_ZERO((perms) & 2) + \ + (perms)) + #endif /* _SYSFS_H_ */ diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h new file mode 100644 index 000000000000..bb5874097f24 --- /dev/null +++ b/include/linux/trace_printk.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_PRINTK_H +#define _LINUX_TRACE_PRINTK_H + +#include <linux/compiler_attributes.h> +#include <linux/instruction_pointer.h> +#include <linux/stddef.h> +#include <linux/stringify.h> + +/* + * General tracing related utility functions - trace_printk(), + * tracing_on/tracing_off and tracing_start()/tracing_stop + * + * Use tracing_on/tracing_off when you want to quickly turn on or off + * tracing. It simply enables or disables the recording of the trace events. + * This also corresponds to the user space /sys/kernel/tracing/tracing_on + * file, which gives a means for the kernel and userspace to interact. + * Place a tracing_off() in the kernel where you want tracing to end. + * From user space, examine the trace, and then echo 1 > tracing_on + * to continue tracing. + * + * tracing_stop/tracing_start has slightly more overhead. It is used + * by things like suspend to ram where disabling the recording of the + * trace is not enough, but tracing must actually stop because things + * like calling smp_processor_id() may crash the system. + * + * Most likely, you want to use tracing_on/tracing_off. 
+ */ + +enum ftrace_dump_mode { + DUMP_NONE, + DUMP_ALL, + DUMP_ORIG, + DUMP_PARAM, +}; + +#ifdef CONFIG_TRACING +void tracing_on(void); +void tracing_off(void); +int tracing_is_on(void); +void tracing_snapshot(void); +void tracing_snapshot_alloc(void); + +extern void tracing_start(void); +extern void tracing_stop(void); + +static inline __printf(1, 2) +void ____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + +/** + * trace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __trace_printk is an internal function for trace_printk() and + * the @ip is passed in via the trace_printk() macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_printks scattered around in + * your code. (Extra memory is used for special buffers that are + * allocated when trace_printk() is used.) + * + * A little optimization trick is done here. If there's only one + * argument, there's no need to scan the string for printf formats. + * The trace_puts() will suffice. But how can we take advantage of + * using trace_puts() when trace_printk() has only one argument? + * By stringifying the args and checking the size we can tell + * whether or not there are args. __stringify((__VA_ARGS__)) will + * turn into "()\0" with a size of 3 when there are no args, anything + * else will be bigger. All we need to do is define a string to this, + * and then take its size and compare to 3. If it's bigger, use + * do_trace_printk() otherwise, optimize it to trace_puts(). Then just + * let gcc optimize the rest. 
+ */ + +#define trace_printk(fmt, ...) \ +do { \ + char _______STR[] = __stringify((__VA_ARGS__)); \ + if (sizeof(_______STR) > 3) \ + do_trace_printk(fmt, ##__VA_ARGS__); \ + else \ + trace_puts(fmt); \ +} while (0) + +#define do_trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_printk_check_format(fmt, ##args); \ + \ + if (__builtin_constant_p(fmt)) \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ +} while (0) + +extern __printf(2, 3) +int __trace_bprintk(unsigned long ip, const char *fmt, ...); + +extern __printf(2, 3) +int __trace_printk(unsigned long ip, const char *fmt, ...); + +/** + * trace_puts - write a string into the ftrace buffer + * @str: the string to record + * + * Note: __trace_bputs is an internal function for trace_puts and + * the @ip is passed in via the trace_puts macro. + * + * This is similar to trace_printk() but is made for those really fast + * paths that a developer wants the least amount of "Heisenbug" effects, + * where the processing of the print format is still too much. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_puts scattered around in + * your code. (Extra memory is used for special buffers that are + * allocated when trace_puts() is used.) + * + * Returns: 0 if nothing was written, positive # if string was. + * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) + */ + +#define trace_puts(str) ({ \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(str) ? 
str : NULL; \ + \ + if (__builtin_constant_p(str)) \ + __trace_bputs(_THIS_IP_, trace_printk_fmt); \ + else \ + __trace_puts(_THIS_IP_, str); \ +}) +extern int __trace_bputs(unsigned long ip, const char *str); +extern int __trace_puts(unsigned long ip, const char *str); + +extern void trace_dump_stack(int skip); + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ +#define ftrace_vprintk(fmt, vargs) \ +do { \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ +} while (0) + +extern __printf(2, 0) int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + +extern __printf(2, 0) int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + +extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); +#else +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void trace_dump_stack(int skip) { } + +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline int tracing_is_on(void) { return 0; } +static inline void tracing_snapshot(void) { } +static inline void tracing_snapshot_alloc(void) { } + +static inline __printf(1, 2) +int trace_printk(const char *fmt, ...) 
+{ + return 0; +} +static __printf(1, 0) inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} +static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } +#endif /* CONFIG_TRACING */ + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index d673747eda8a..7e71d260763c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -2,7 +2,6 @@ #ifndef _LINUX_TYPES_H #define _LINUX_TYPES_H -#define __EXPORTED_HEADERS__ #include <uapi/linux/types.h> #ifndef __ASSEMBLY__ @@ -185,7 +184,7 @@ typedef phys_addr_t resource_size_t; typedef unsigned long irq_hw_number_t; typedef struct { - int counter; + int __aligned(sizeof(int)) counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index c47d4b9b88b3..85b1fff02fde 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -17,6 +17,7 @@ #ifndef __LINUX_WW_MUTEX_H #define __LINUX_WW_MUTEX_H +#include <linux/instruction_pointer.h> #include <linux/mutex.h> #include <linux/rtmutex.h> |
