Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.kexec              |  11
-rw-r--r--  kernel/Makefile                   |   1
-rw-r--r--  kernel/bpf/stream.c               |   2
-rw-r--r--  kernel/bpf/syscall.c              |   2
-rw-r--r--  kernel/crash_core.c               |  30
-rw-r--r--  kernel/crash_core_test.c          | 343
-rw-r--r--  kernel/dma/remap.c                |   2
-rw-r--r--  kernel/events/uprobes.c           |  32
-rw-r--r--  kernel/exit.c                     |  23
-rw-r--r--  kernel/fork.c                     |  37
-rw-r--r--  kernel/freezer.c                  |  20
-rw-r--r--  kernel/hung_task.c                |  78
-rw-r--r--  kernel/kallsyms_selftest.c        |   2
-rw-r--r--  kernel/kcov.c                     |   9
-rw-r--r--  kernel/kexec_core.c               |   1
-rw-r--r--  kernel/kexec_file.c               |   1
-rw-r--r--  kernel/kexec_handover.c           |  94
-rw-r--r--  kernel/locking/rtmutex_common.h   |   9
-rw-r--r--  kernel/panic.c                    | 129
-rw-r--r--  kernel/printk/internal.h          |   1
-rw-r--r--  kernel/printk/nbcon.c             |  14
-rw-r--r--  kernel/printk/printk.c            |  37
-rw-r--r--  kernel/printk/printk_ringbuffer.c |   2
-rw-r--r--  kernel/rcu/rcuscale.c             |   2
-rw-r--r--  kernel/resource.c                 |  50
-rw-r--r--  kernel/sched/fair.c               |  10
-rw-r--r--  kernel/sys.c                      | 101
-rw-r--r--  kernel/time/time.c                |   1
-rw-r--r--  kernel/watchdog.c                 |  28
-rw-r--r--  kernel/watchdog_perf.c            |   4
30 files changed, 846 insertions(+), 230 deletions(-)
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1224dd937df0..422270d64820 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that is required to be built-in. +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 41751834e764..df3dd8291bb6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ab592db4a4bf..eb6c5a21c2ef 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a48fa86f82a7..2a9456a3e730 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -583,7 +583,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a4ef79591eb2..3b1c43382eec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -22,6 +22,7 @@ #include <linux/btf.h> #include <linux/objtool.h> #include <linux/delay.h> +#include <linux/panic.h> #include <asm/page.h> #include <asm/sections.h> @@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. 
	 */
-	atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+	panic_reset();
 	}
 }
 
@@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 	return 0;
 }
 
+/**
+ * crash_exclude_mem_range - exclude a mem range from existing ranges
+ * @mem: mem->ranges contains an array of ranges sorted in ascending order
+ * @mstart: the start of the to-be-excluded range
+ * @mend: the end of the to-be-excluded range
+ *
+ * If you are unsure whether a range split will happen, to avoid function
+ * call failure because of -ENOMEM, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1
+ * before calling the function each time.
+ *
+ * Returns 0 if a memory range is excluded successfully.
+ * Returns -ENOMEM if mem->ranges doesn't have space to hold split ranges.
+ */
 int crash_exclude_mem_range(struct crash_mem *mem,
 			    unsigned long long mstart, unsigned long long mend)
 {
@@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
 
 ssize_t crash_get_memory_size(void)
 {
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+					  unsigned int nr_initial_ranges,
+					  const struct range *initial_ranges)
+{
+	struct crash_mem *mem;
+	size_t alloc_size;
+
+	// Check if max_ranges can even hold initial_ranges
+	if (max_ranges < nr_initial_ranges) {
+		kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+			  max_ranges, nr_initial_ranges);
+		return NULL;
+	}
+
+	alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+	mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+	if (!mem) {
+		kunit_err(test, "Failed to allocate crash_mem\n");
+		return NULL;
+	}
+
+	mem->max_nr_ranges = max_ranges;
+	mem->nr_ranges = nr_initial_ranges;
+	if (initial_ranges && nr_initial_ranges > 0) {
+		memcpy(mem->ranges, initial_ranges,
+		       nr_initial_ranges * sizeof(struct range));
+	}
+
+	return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+				const struct range *actual_ranges,
+				unsigned int actual_nr_ranges,
+				const struct range *expected_ranges,
+				unsigned int expected_nr_ranges,
+				const char *case_name)
+{
+	unsigned int i;
+
+	KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+			    "%s: Number of ranges mismatch.", case_name);
+
+	for (i = 0; i < expected_nr_ranges; i++) {
+		KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+				    "%s: Range %u start mismatch.", case_name, i);
+		KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+				    "%s: Range %u end mismatch.", case_name, i);
+	}
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+	const char *description;
+	unsigned long long exclude_start;
+	unsigned long long exclude_end;
+	unsigned int initial_max_ranges;
+	const struct range *initial_ranges;
+	unsigned int initial_nr_ranges;
+	const struct range *expected_ranges;
+	unsigned int expected_nr_ranges;
+	int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+	struct crash_mem *mem;
+	int ret;
+
+	kunit_info(test, "%s",
params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return value mismatch.", params->description); + + if (params->expected_ret == 0) { + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. + */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = 
&single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. 
+ */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 9e2afad1c615..b7c1c0c92d0c 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -49,7 +49,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, if (!pages) return NULL; for (i = 0; i < count; i++) - pages[i] = nth_page(page, i); + pages[i] = page++; vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kvfree(pages); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5dcf927310fd..8709c69118b5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1163,15 +1163,15 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). 
*/ - first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) - set_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_set(MMF_HAS_UPROBES, mm); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) - clear_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } @@ -1181,7 +1181,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; - set_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, mm); return set_orig_insn(&uprobe->arch, vma, vaddr); } @@ -1313,7 +1313,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); - } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } @@ -1605,7 +1605,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && - test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) @@ -1665,12 +1665,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || - test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || + mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) - set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, @@ -1843,10 +1843,10 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { - set_bit(MMF_HAS_UPROBES, &newmm->flags); + if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { + mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ - set_bit(MMF_RECALC_UPROBES, &newmm->flags); + mm_flags_set(MMF_RECALC_UPROBES, newmm); } } @@ -2390,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) return; } - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) @@ -2488,7 +2488,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb *is_swbp = -EFAULT; } - if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); @@ -2869,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) if (!current->mm) return 0; - if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 343eb97543d5..9f74e8f1c431 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -780,24 +780,29 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef 
CONFIG_DEBUG_STACK_USAGE +#ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP n--; -# else - n++; -# endif } while (!*n); -# ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else +} +#else /* !CONFIG_STACK_GROWSUP */ +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif } +#endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) @@ -856,9 +861,9 @@ static void check_stack_usage(void) } spin_unlock(&low_water_lock); } -#else +#else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} -#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { diff --git a/kernel/fork.c b/kernel/fork.c index cffa6157a55a..3da0f08615a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -290,6 +290,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!vm_area) continue; + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); + return -ENOMEM; + } + /* Reset stack metadata. */ kasan_unpoison_range(vm_area->addr, THREAD_SIZE); @@ -298,11 +303,6 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(vm_area)) { - vfree(vm_area->addr); - return -ENOMEM; - } - tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; @@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } @@ -1889,7 +1892,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); - set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -2129,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif p->blocked_on = NULL; /* not blocked yet */ @@ -2544,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2660,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, 
 		    unsigned long flags)
 {
 	struct kernel_clone_args args = {
-		.flags = ((lower_32_bits(flags) | CLONE_VM |
-			   CLONE_UNTRACED) & ~CSIGNAL),
-		.exit_signal = (lower_32_bits(flags) & CSIGNAL),
+		.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal = (flags & CSIGNAL),
 		.fn = fn,
 		.fn_arg = arg,
 		.name = name,
@@ -2678,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
 pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
 	struct kernel_clone_args args = {
-		.flags = ((lower_32_bits(flags) | CLONE_VM |
-			   CLONE_UNTRACED) & ~CSIGNAL),
-		.exit_signal = (lower_32_bits(flags) & CSIGNAL),
+		.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal = (flags & CSIGNAL),
 		.fn = fn,
 		.fn_arg = arg,
 	};
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 6a96149aede9..ddc11a8bd2ea 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -10,6 +10,7 @@
 #include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
+#include <linux/oom.h>
 #include <linux/kthread.h>
 
 /* total number of freezing conditions in effect */
@@ -40,7 +41,7 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
-	if (test_tsk_thread_flag(p, TIF_MEMDIE))
+	if (tsk_is_oom_victim(p))
 		return false;
 
 	if (pm_nosig_freezing || cgroup_freezing(p))
@@ -206,6 +207,23 @@ void __thaw_task(struct task_struct *p)
 	wake_up_state(p, TASK_FROZEN);
 }
 
+/*
+ * thaw_process - Thaw a frozen process
+ * @p: the process to be thawed
+ *
+ * Iterate over all threads of @p and call __thaw_task() on each.
+ */
+void thaw_process(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	rcu_read_lock();
+	for_each_thread(p, t) {
+		__thaw_task(t);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * set_freezable - make %current freezable
  *
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8708a1205f82..b2c1f14b8129 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -95,9 +95,41 @@ static struct notifier_block panic_block = {
 	.notifier_call = hung_task_panic,
 };
 
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+	unsigned int state = READ_ONCE(t->__state);
+
+	/*
+	 * skip the TASK_KILLABLE tasks -- these can be killed
+	 * skip the TASK_IDLE tasks -- those are genuinely idle
+	 * skip the TASK_FROZEN tasks -- the freezer has legitimately stopped
+	 * them from scheduling
+	 */
+	if (!(state & TASK_UNINTERRUPTIBLE) ||
+	    (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+		return false;
+
+	/*
+	 * When a freshly created task is scheduled once and changes its state
+	 * to TASK_UNINTERRUPTIBLE without ever having been switched out, it
+	 * mustn't be checked.
+	 */
+	if (unlikely(!switch_count))
+		return false;
+
+	if (switch_count != t->last_switch_count) {
+		t->last_switch_count = switch_count;
+		t->last_switch_time = jiffies;
+		return false;
+	}
+	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+		return false;
+
+	return true;
+}
+
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
-static void debug_show_blocker(struct task_struct *task)
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
 {
 	struct task_struct *g, *t;
 	unsigned long owner, blocker, blocker_type;
@@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task)
 				t->pid, rwsem_blocked_by);
 			break;
 		}
-		sched_show_task(t);
+		/* Avoid a duplicated task dump; skip if the task is also hung.
*/ + if (!task_is_hung(t, timeout)) + sched_show_task(t); return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. - */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); + debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) @@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, timeout); } unlock: rcu_read_unlock(); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; diff --git a/kernel/kcov.c b/kernel/kcov.c index 1d85597057e1..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. 
+ */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 31203f0bacaf..fa00b239c5d9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..5083c68c3a4e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -32,6 +32,22 @@ #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" #define PROP_SUB_FDT "fdt" +#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ + +/* + * KHO uses page->private, which is an unsigned long, to store page metadata. + * Use it to store both the magic and the order. + */ +union kho_page_info { + unsigned long page_private; + struct { + unsigned int order; + unsigned int magic; + }; +}; + +static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); + static bool kho_enable __ro_after_init; bool kho_is_enabled(void) @@ -183,11 +199,27 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } -/* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page, unsigned int order) +static struct page *kho_restore_page(phys_addr_t phys) { - unsigned int nr_pages = (1 << order); + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + union kho_page_info info; + unsigned int nr_pages; + + if (!page) + return NULL; + info.page_private = page->private; + /* + * deserialize_bitmap() only sets the magic on the head page. This magic + * check also implicitly makes sure phys is order-aligned since for + * non-order-aligned phys addresses, magic will never be set. + */ + if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER)) + return NULL; + nr_pages = (1 << info.order); + + /* Clear private to make sure later restores on this page error out. */ + page->private = 0; /* Head page gets refcount of 1. */ set_page_count(page, 1); @@ -195,10 +227,11 @@ static void kho_restore_page(struct page *page, unsigned int order) for (unsigned int i = 1; i < nr_pages; i++) set_page_count(page + i, 0); - if (order > 0) - prep_compound_page(page, order); + if (info.order > 0) + prep_compound_page(page, info.order); adjust_managed_page_count(page, nr_pages); + return page; } /** @@ -209,18 +242,9 @@ static void kho_restore_page(struct page *page, unsigned int order) */ struct folio *kho_restore_folio(phys_addr_t phys) { - struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned long order; + struct page *page = kho_restore_page(phys); - if (!page) - return NULL; - - order = page->private; - if (order > MAX_PAGE_ORDER) - return NULL; - - kho_restore_page(page, order); - return page_folio(page); + return page ? 
page_folio(page) : NULL; } EXPORT_SYMBOL_GPL(kho_restore_folio); @@ -341,10 +365,13 @@ static void __init deserialize_bitmap(unsigned int order, phys_addr_t phys = elm->phys_start + (bit << (order + PAGE_SHIFT)); struct page *page = phys_to_page(phys); + union kho_page_info info; memblock_reserve(phys, sz); memblock_reserved_mark_noinit(phys, sz); - page->private = order; + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; } } @@ -405,6 +432,7 @@ static int __init kho_parse_scratch_size(char *p) { size_t len; unsigned long sizes[3]; + size_t total_size = 0; int i; if (!p) @@ -441,11 +469,19 @@ static int __init kho_parse_scratch_size(char *p) } sizes[i] = memparse(p, &endp); - if (!sizes[i] || endp == p) + if (endp == p) return -EINVAL; p = endp; + total_size += sizes[i]; } + if (!total_size) + return -EINVAL; + + /* The string should be fully consumed by now. */ + if (*p) + return -EINVAL; + scratch_size_lowmem = sizes[0]; scratch_size_global = sizes[1]; scratch_size_pernode = sizes[2]; @@ -952,6 +988,26 @@ static const void *kho_get_fdt(void) } /** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + +/** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). * @phys: if found, the physical address of the sub FDT is stored in @phys. @@ -1233,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_out.finalized) return 0; image->kho.fdt = page_to_phys(kho_out.ser.fdt); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..24cc3eec1805 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? 
		(1 << TAINT_RANDSTRUCT) : 0;
 static int pause_on_oops;
@@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly;
 static bool panic_console_replay;
 
 bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
 
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
 EXPORT_SYMBOL_GPL(panic_timeout);
@@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
 
+static void panic_print_deprecated(void)
+{
+	pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
 #ifdef CONFIG_SYSCTL
 
 /*
@@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write,
 static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
 				      void *buffer, size_t *lenp, loff_t *ppos)
 {
-	pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+	panic_print_deprecated();
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 
@@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void)
 
 atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
 
+bool panic_try_start(void)
+{
+	int old_cpu, this_cpu;
+
+	/*
+	 * Only one CPU is allowed to execute the crash_kexec() code as with
+	 * panic(). Otherwise parallel calls of panic() and crash_kexec()
+	 * may stop each other. To exclude them, we use panic_cpu here too.
+	 */
+	old_cpu = PANIC_CPU_INVALID;
+	this_cpu = raw_smp_processor_id();
+
+	return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+	atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+	return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+	/*
+	 * We can use raw_smp_processor_id() here because it is impossible for
+	 * the task to be migrated to the panic_cpu, or away from it. If
+	 * panic_cpu has already been set, and we're not currently executing on
+	 * that CPU, then we never will be.
+	 */
+	return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+	return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
 /*
  * A variant of panic() called from NMI context. We return if we've already
  * panicked on this CPU. If another CPU already panicked, loop in
@@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
  */
 void nmi_panic(struct pt_regs *regs, const char *msg)
 {
-	int old_cpu, this_cpu;
-
-	old_cpu = PANIC_CPU_INVALID;
-	this_cpu = raw_smp_processor_id();
-
-	/* atomic_try_cmpxchg updates old_cpu on failure */
-	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
+	if (panic_try_start())
 		panic("%s", msg);
-	else if (old_cpu != this_cpu)
+	else if (panic_on_other_cpu())
 		nmi_panic_self_stop(regs);
 }
 EXPORT_SYMBOL(nmi_panic);
@@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin)
 		      origin, limit);
 }
 
+static void panic_trigger_all_cpu_backtrace(void)
+{
+	/* Temporarily allow non-panic CPUs to write their backtraces.
*/ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. */ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_CPU_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args) static char buf[1024]; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); @@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all @@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index ef282001f200..f72bbfa266d6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -332,7 +332,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git 
a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 646801813415..558ef3177976 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -12,6 +12,7 @@ #include <linux/irqflags.h> #include <linux/kthread.h> #include <linux/minmax.h> +#include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> @@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. - * Event #5 is not possible due to the other_cpu_in_panic() check + * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ @@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ @@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs; */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; @@ -614,7 +614,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5aee9ffb16b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/panic.h> #include <linux/uaccess.h> #include <asm/sections.h> @@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - -/* - * Return true if a panic is in progress on a remote CPU. 
- * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !this_cpu_in_panic()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2843,7 +2816,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) goto abandon; if (do_cond_resched) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b521d0455992..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -796,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/resource.c b/kernel/resource.c index f9bb5481501a..b9fa2a4ce089 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1388,6 +1388,47 @@ void __release_region(struct resource *parent, resource_size_t start, EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE +static void append_child_to_parent(struct resource *new_parent, struct resource *new_child) +{ + struct resource *child; + + child = new_parent->child; + if (child) { + while (child->sibling) + child = child->sibling; + child->sibling = new_child; + } else { + new_parent->child = new_child; + } + new_child->parent = new_parent; + new_child->sibling = NULL; +} + +/* + * Reparent all child resources that no longer belong to "low" after a split to + * "high". Note that "high" does not have any children, because "low" is the + * original resource and "high" is a new resource. Treat "low" as the original + * resource being split and defer its range adjustment to __adjust_resource(). 
+ */ +static void reparent_children_after_split(struct resource *low, + struct resource *high, + resource_size_t split_addr) +{ + struct resource *child, *next, **p; + + p = &low->child; + while ((child = *p)) { + next = child->sibling; + if (child->start > split_addr) { + /* unlink child */ + *p = next; + append_child_to_parent(high, child); + } else { + p = &child->sibling; + } + } +} + /** * release_mem_region_adjustable - release a previously reserved memory region * @start: resource start address @@ -1397,15 +1438,13 @@ EXPORT_SYMBOL(__release_region); * is released from a currently busy memory resource. The requested region * must either match exactly or fit into a single busy resource entry. In * the latter case, the remaining resource is adjusted accordingly. - * Existing children of the busy memory resource must be immutable in the - * request. * * Note: * - Additional release conditions, such as overlapping region, can be * supported after they are confirmed as valid cases. - * - When a busy memory resource gets split into two entries, the code - * assumes that all children remain in the lower address entry for - * simplicity. Enhance this logic when necessary. + * - When a busy memory resource gets split into two entries, its children are + * reassigned to the correct parent based on their range. If a child memory + * resource overlaps with more than one parent, enhance the logic as needed. */ void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { @@ -1482,6 +1521,7 @@ retry: new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; + reparent_children_after_split(res, new_res, end); if (WARN_ON_ONCE(__adjust_resource(res, res->start, start - res->start))) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a89f949e307..bc0b7ce8a65d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size); rss = get_mm_rss(p->mm); if (!rss) rss = nr_scan_pages; @@ -1923,17 +1923,18 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; + long nr = folio_nr_pages(folio); pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */ pgdat->nbp_threshold = 0; + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr); return true; } def_th = sysctl_numa_balancing_hot_threshold; - rate_limit = sysctl_numa_balancing_promote_rate_limit << \ - (20 - PAGE_SHIFT); + rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit); numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? 
: def_th; @@ -1941,8 +1942,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, if (latency >= th) return false; - return !numa_promotion_rate_limit(pgdat, rate_limit, - folio_nr_pages(folio)); + return !numa_promotion_rate_limit(pgdat, rate_limit, nr); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/kernel/sys.c b/kernel/sys.c index 1e28b40053ce..8b58eece4e58 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); @@ -2392,9 +2410,9 @@ static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; - if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; - if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm)) ret |= PR_MDWE_NO_INHERIT; return ret; @@ -2427,9 +2445,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) - set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); + mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) - set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + mm_flags_set(MMF_HAS_MDWE, current->mm); return 0; } @@ -2452,6 +2470,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. 
+	 */
+	if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
+		return -EINVAL;
+	if (mmap_write_lock_killable(current->mm))
+		return -EINTR;
+	if (thp_disable) {
+		if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
+			mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+			mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+		} else {
+			mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm);
+			mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+		}
+	} else {
+		mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+		mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+	}
+	mmap_write_unlock(current->mm);
+	return 0;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2470,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			error = -EINVAL;
 			break;
 		}
+		/*
+		 * Ensure that either:
+		 *
+		 * 1. Subsequent getppid() calls reflect the parent process having died.
+		 * 2. forget_original_parent() will send the new me->pdeath_signal.
+		 *
+		 * Also prevent the read of me->pdeath_signal from being a data race.
+		 */
+		read_lock(&tasklist_lock);
 		me->pdeath_signal = arg2;
+		read_unlock(&tasklist_lock);
 		break;
 	case PR_GET_PDEATHSIG:
 		error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2625,20 +2698,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		return task_no_new_privs(current) ? 1 : 0;
 	case PR_GET_THP_DISABLE:
-		if (arg2 || arg3 || arg4 || arg5)
-			return -EINVAL;
-		error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
+		error = prctl_get_thp_disable(arg2, arg3, arg4, arg5);
 		break;
 	case PR_SET_THP_DISABLE:
-		if (arg3 || arg4 || arg5)
-			return -EINVAL;
-		if (mmap_write_lock_killable(me->mm))
-			return -EINTR;
-		if (arg2)
-			set_bit(MMF_DISABLE_THP, &me->mm->flags);
-		else
-			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
-		mmap_write_unlock(me->mm);
+		error = prctl_set_thp_disable(arg2, arg3, arg4, arg5);
 		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
 	case PR_MPX_DISABLE_MANAGEMENT:
@@ -2770,7 +2833,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
 
-		error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+		error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm);
 		break;
 #endif
 	case PR_RISCV_V_SET_CONTROL:
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 1b69caa87480..0ba8e3c50d62 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -858,6 +858,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 
 	return res;
 }
+EXPORT_SYMBOL_GPL(timespec64_add_safe);
 
 /**
  * get_timespec64 - get user's time value into kernel space
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..5b62d1002783 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail);
  */
 static u16 get_16bit_precision(u64 data_ns)
 {
-	return data_ns >> 24LL;	/* 2^24ns ~= 16.8ms */
+	/*
+	 * 2^24ns ~= 16.8ms
+	 * Round to the nearest multiple of 16.8 milliseconds.
+	 */
+	return (data_ns + (1 << 23)) >> 24LL;
 }
 
 static void update_cpustat(void)
@@ -444,6 +448,14 @@ static void update_cpustat(void)
 		old_stat = __this_cpu_read(cpustat_old[i]);
 		new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
 		util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+		/*
+		 * Since we use 16-bit precision, the raw data undergoes
+		 * integer division, which may sometimes lose data, and the
+		 * result might then exceed 100%. To avoid confusion, we
+		 * enforce a 100% display cap when the calculation exceeds it.
+		 */
+		if (util > 100)
+			util = 100;
 		__this_cpu_write(cpustat_util[tail][i], util);
 		__this_cpu_write(cpustat_old[i], new_stat);
 	}
@@ -455,17 +467,17 @@ static void print_cpustat(void)
 {
 	int i, group;
 	u8 tail = __this_cpu_read(cpustat_tail);
-	u64 sample_period_second = sample_period;
+	u64 sample_period_msecond = sample_period;
 
-	do_div(sample_period_second, NSEC_PER_SEC);
+	do_div(sample_period_msecond, NSEC_PER_MSEC);
 
 	/*
 	 * Outputting the "watchdog" prefix on every line is redundant and not
 	 * concise, and the original alarm information is sufficient for
 	 * positioning in logs, hence here printk() is used instead of pr_crit().
 	 */
-	printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
-	       smp_processor_id(), sample_period_second);
+	printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
+	       smp_processor_id(), sample_period_msecond);
 
 	for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
 		group = (tail + i) % NUM_SAMPLE_PERIODS;
@@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
 
+	/*
+	 * Skip the buddy check if a panic is in progress.
+	 */
+	if (panic_in_progress())
+		return HRTIMER_NORESTART;
+
 	watchdog_hardlockup_kick();
 
 	/* kick the softlockup detector */
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 9c58f5b4381d..d3ca70e3c256 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) "NMI watchdog: " fmt
 
+#include <linux/panic.h>
 #include <linux/nmi.h>
 #include <linux/atomic.h>
 #include <linux/module.h>
@@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
+	if (panic_in_progress())
+		return;
+
 	if (!watchdog_check_timestamp())
 		return;
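
Note: the kernel/crash_core.c kernel-doc and the KUnit cases above pin down the one subtle failure mode of crash_exclude_mem_range(): excluding a range strictly inside an existing range splits it in two, which fails with -ENOMEM when mem->ranges has no spare slot (cases 1.17 and 2.2). A caller can follow the documented "nr_ranges + 1" rule like this; a minimal sketch only, where exclude_one_range() is a hypothetical wrapper and not part of this series:

	/* Sketch: enforce the documented spare-slot precondition before
	 * calling crash_exclude_mem_range(), so a split cannot fail. */
	static int exclude_one_range(struct crash_mem *mem,
				     unsigned long long start,
				     unsigned long long end)
	{
		/* A split may grow nr_ranges by one; demand room for it. */
		if (mem->max_nr_ranges < mem->nr_ranges + 1)
			return -ENOMEM;

		return crash_exclude_mem_range(mem, start, end);
	}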
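Note: the crash_core.c, panic.c, printk, and watchdog hunks all converge on the same refactor: the open-coded atomic_try_cmpxchg() dance on panic_cpu is centralized into panic_try_start()/panic_reset() plus the panic_in_progress()/panic_on_this_cpu()/panic_on_other_cpu() predicates, presumably declared in <linux/panic.h> since the nbcon.c and watchdog_perf.c hunks add that include. A minimal sketch of the intended calling pattern, mirroring the crash_kexec() conversion above; example_panic_path() is hypothetical and not part of this series:

	#include <linux/panic.h>

	static void example_panic_path(void)
	{
		if (panic_try_start()) {
			/* This CPU won the panic_cpu race; it alone does
			 * the crash/panic work. */

			/* Allow another panic()/crash_kexec() attempt. */
			panic_reset();
		} else if (panic_on_other_cpu()) {
			/* A remote CPU is panicking: back off and release
			 * any printing resources it may need. */
		}
	}

The predicates replace printk's private this_cpu_in_panic()/other_cpu_in_panic(), which is why the printk/internal.h declaration is dropped and every console path switches to the shared helpers.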