summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec11
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/stream.c2
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/crash_core.c30
-rw-r--r--kernel/crash_core_test.c343
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/events/uprobes.c32
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c37
-rw-r--r--kernel/freezer.c20
-rw-r--r--kernel/hung_task.c78
-rw-r--r--kernel/kallsyms_selftest.c2
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/kexec_core.c1
-rw-r--r--kernel/kexec_file.c1
-rw-r--r--kernel/kexec_handover.c94
-rw-r--r--kernel/locking/rtmutex_common.h9
-rw-r--r--kernel/panic.c129
-rw-r--r--kernel/printk/internal.h1
-rw-r--r--kernel/printk/nbcon.c14
-rw-r--r--kernel/printk/printk.c37
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/rcu/rcuscale.c2
-rw-r--r--kernel/resource.c50
-rw-r--r--kernel/sched/fair.c10
-rw-r--r--kernel/sys.c101
-rw-r--r--kernel/time/time.c1
-rw-r--r--kernel/watchdog.c28
-rw-r--r--kernel/watchdog_perf.c4
30 files changed, 846 insertions, 230 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 1224dd937df0..422270d64820 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS
CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
is required to be built-in.
+config CRASH_DUMP_KUNIT_TEST
+ tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS
+ depends on CRASH_DUMP && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This option builds KUnit unit tests for kernel crash dumps. The unit
+ tests will be used to verify the correctness of covered functions and
+ also prevent any regression.
+
+ If unsure, say N.
+
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 41751834e764..df3dd8291bb6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
+obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index ab592db4a4bf..eb6c5a21c2ef 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void)
struct bpf_stream_page *stream_page, *old_stream_page;
struct page *page;
- page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+ page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
if (!page)
return NULL;
stream_page = page_address(page);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a48fa86f82a7..2a9456a3e730 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -583,7 +583,7 @@ static bool can_alloc_pages(void)
static struct page *__bpf_alloc_page(int nid)
{
if (!can_alloc_pages())
- return alloc_pages_nolock(nid, 0);
+ return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a4ef79591eb2..3b1c43382eec 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -22,6 +22,7 @@
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/delay.h>
+#include <linux/panic.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec);
__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
- int old_cpu, this_cpu;
-
- /*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
- */
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* This is the 1st CPU which comes here, so go ahead. */
__crash_kexec(regs);
@@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
* Reset panic_cpu to allow another panic()/crash_kexec()
* call.
*/
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ panic_reset();
}
}
@@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
return 0;
}
+/**
+ * crash_exclude_mem_range - exclude a mem range for existing ranges
+ * @mem: mem->range contains an array of ranges sorted in ascending order
+ * @mstart: the start of to-be-excluded range
+ * @mend: the start of to-be-excluded range
+ *
+ * If you are unsure if a range split will happen, to avoid function call
+ * failure because of -ENOMEM, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1
+ * before calling the function each time.
+ *
+ * returns 0 if a memory range is excluded successfully
+ * return -ENOMEM if mem->ranges doesn't have space to hold split ranges
+ */
int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
@@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
ssize_t crash_get_memory_size(void)
{
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+ unsigned int nr_initial_ranges,
+ const struct range *initial_ranges)
+{
+ struct crash_mem *mem;
+ size_t alloc_size;
+
+ // Check if max_ranges can even hold initial_ranges
+ if (max_ranges < nr_initial_ranges) {
+ kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+ max_ranges, nr_initial_ranges);
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+ mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+ if (!mem) {
+ kunit_err(test, "Failed to allocate crash_mem\n");
+ return NULL;
+ }
+
+ mem->max_nr_ranges = max_ranges;
+ mem->nr_ranges = nr_initial_ranges;
+ if (initial_ranges && nr_initial_ranges > 0) {
+ memcpy(mem->ranges, initial_ranges,
+ nr_initial_ranges * sizeof(struct range));
+ }
+
+ return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+ const struct range *actual_ranges,
+ unsigned int actual_nr_ranges,
+ const struct range *expected_ranges,
+ unsigned int expected_nr_ranges,
+ const char *case_name)
+{
+ unsigned int i;
+
+ KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+ "%s: Number of ranges mismatch.", case_name);
+
+ for (i = 0; i < expected_nr_ranges; i++) {
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+ "%s: Range %u start mismatch.", case_name, i);
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+ "%s: Range %u end mismatch.", case_name, i);
+ }
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+ const char *description;
+ unsigned long long exclude_start;
+ unsigned long long exclude_end;
+ unsigned int initial_max_ranges;
+ const struct range *initial_ranges;
+ unsigned int initial_nr_ranges;
+ const struct range *expected_ranges;
+ unsigned int expected_nr_ranges;
+ int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+ struct crash_mem *mem;
+ int ret;
+
+ kunit_info(test, "%s", params->description);
+
+ mem = create_crash_mem(test, params->initial_max_ranges,
+ params->initial_nr_ranges, params->initial_ranges);
+ if (!mem)
+ return; // Error already logged by create_crash_mem or kunit_kzalloc
+
+ ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end);
+
+ KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret,
+ "%s: Return value mismatch.", params->description);
+
+ if (params->expected_ret == 0) {
+ assert_ranges_equal(test, mem->ranges, mem->nr_ranges,
+ params->expected_ranges, params->expected_nr_ranges,
+ params->description);
+ } else {
+ // If an error is expected, nr_ranges might still be relevant to check
+ // depending on the exact point of failure. For ENOMEM on split,
+ // nr_ranges shouldn't have changed.
+ KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges,
+ mem->nr_ranges,
+ "%s: Number of ranges mismatch on error.",
+ params->description);
+ }
+}
+
+/*
+ * Test Strategy 1: One to-be-excluded range A and one existing range B.
+ *
+ * Exhaust all possibilities of the position of A regarding B.
+ */
+
+static const struct range single_range_b = { .start = 100, .end = 199 };
+
+static const struct exclude_test_param exclude_single_range_test_data[] = {
+ {
+ .description = "1.1: A is left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 50,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.2: A's right boundary touches B's left boundary",
+ .exclude_start = 10, .exclude_end = 99,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.3: A overlaps B's left part",
+ .exclude_start = 50, .exclude_end = 149,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.4: A is completely inside B",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 119 },
+ { .start = 180, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.5: A overlaps B's right part",
+ .exclude_start = 150, .exclude_end = 249,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.6: A's left boundary touches B's right boundary",
+ .exclude_start = 200, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.7: A is right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 300,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.8: A completely covers B and extends beyond",
+ .exclude_start = 50, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.9: A covers B and extends to the left",
+ .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.10: A covers B and extends to the right",
+ .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.11: A is identical to B",
+ .exclude_start = 100, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.12: A is a point, left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 10,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.13: A is a point, at start of B",
+ .exclude_start = 100, .exclude_end = 100,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.14: A is a point, in middle of B (causes split)",
+ .exclude_start = 150, .exclude_end = 150,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 149 },
+ { .start = 151, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.15: A is a point, at end of B",
+ .exclude_start = 199, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.16: A is a point, right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ // ENOMEM case for single range split
+ {
+ .description = "1.17: A completely inside B (split), no space (ENOMEM)",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 1, // Not enough for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 1, // Should remain unchanged
+ .expected_ret = -ENOMEM,
+ },
+};
+
+
+static void exclude_single_range_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description);
+ run_exclude_test_case(test, &exclude_single_range_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * Test Strategy 2: Regression test.
+ */
+
+static const struct exclude_test_param exclude_range_regression_test_data[] = {
+ // Test data from commit a2e9a95d2190
+ {
+ .description = "2.1: exclude low 1M",
+ .exclude_start = 0, .exclude_end = (1 << 20) - 1,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 0, .end = 0x3efff },
+ { .start = 0x3f000, .end = 0x3ffff },
+ { .start = 0x40000, .end = 0x9ffff }
+ },
+ .initial_nr_ranges = 3,
+ .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u
+ {
+ .description = "2.2: when range out of bound",
+ .exclude_start = 100, .exclude_end = 200,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 1, .end = 299 },
+ { .start = 401, .end = 1000 },
+ { .start = 1001, .end = 2000 }
+ },
+ .initial_nr_ranges = 3,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 3, // Should remain unchanged
+ .expected_ret = -ENOMEM
+ },
+
+};
+
+
+static void exclude_range_regression_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description);
+ run_exclude_test_case(test, &exclude_range_regression_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * KUnit Test Suite
+ */
+static struct kunit_case crash_exclude_mem_range_test_cases[] = {
+ KUNIT_CASE(exclude_single_range_test),
+ KUNIT_CASE(exclude_range_regression_test),
+ {}
+};
+
+static struct kunit_suite crash_exclude_mem_range_suite = {
+ .name = "crash_exclude_mem_range_tests",
+ .test_cases = crash_exclude_mem_range_test_cases,
+ // .init and .exit can be NULL if not needed globally for the suite
+};
+
+kunit_test_suite(crash_exclude_mem_range_suite);
+
+MODULE_DESCRIPTION("crash dump KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 9e2afad1c615..b7c1c0c92d0c 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -49,7 +49,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
if (!pages)
return NULL;
for (i = 0; i < count; i++)
- pages[i] = nth_page(page, i);
+ pages[i] = page++;
vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
kvfree(pages);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5dcf927310fd..8709c69118b5 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1163,15 +1163,15 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
* the task can hit this breakpoint right after __replace_page().
*/
- first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm);
if (first_uprobe)
- set_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_set(MMF_HAS_UPROBES, mm);
ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
- clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_RECALC_UPROBES, mm);
else if (first_uprobe)
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
return ret;
}
@@ -1181,7 +1181,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
- set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, mm);
return set_orig_insn(&uprobe->arch, vma, vaddr);
}
@@ -1313,7 +1313,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, vma, info->vaddr);
- } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) {
if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
@@ -1605,7 +1605,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (vma->vm_file &&
(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
- test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+ mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm))
delayed_ref_ctr_inc(vma);
if (!valid_vma(vma, true))
@@ -1665,12 +1665,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
- test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) ||
+ mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm))
return;
if (vma_has_uprobes(vma, start, end))
- set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm);
}
static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
@@ -1843,10 +1843,10 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
- set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) {
+ mm_flags_set(MMF_HAS_UPROBES, newmm);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
- set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, newmm);
}
}
@@ -2390,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
return;
}
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
}
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
@@ -2488,7 +2488,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
*is_swbp = -EFAULT;
}
- if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm))
mmf_recalc_uprobes(mm);
mmap_read_unlock(mm);
@@ -2869,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
if (!current->mm)
return 0;
- if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
(!current->utask || !current->utask->return_instances))
return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..9f74e8f1c431 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -780,24 +780,29 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_STACK_GROWSUP
unsigned long stack_not_used(struct task_struct *p)
{
unsigned long *n = end_of_stack(p);
do { /* Skip over canary */
-# ifdef CONFIG_STACK_GROWSUP
n--;
-# else
- n++;
-# endif
} while (!*n);
-# ifdef CONFIG_STACK_GROWSUP
return (unsigned long)end_of_stack(p) - (unsigned long)n;
-# else
+}
+#else /* !CONFIG_STACK_GROWSUP */
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n++;
+ } while (!*n);
+
return (unsigned long)n - (unsigned long)end_of_stack(p);
-# endif
}
+#endif /* CONFIG_STACK_GROWSUP */
/* Count the maximum pages reached in kernel stacks */
static inline void kstack_histogram(unsigned long used_stack)
@@ -856,9 +861,9 @@ static void check_stack_usage(void)
}
spin_unlock(&low_water_lock);
}
-#else
+#else /* !CONFIG_DEBUG_STACK_USAGE */
static inline void check_stack_usage(void) {}
-#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index cffa6157a55a..3da0f08615a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -290,6 +290,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!vm_area)
continue;
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
+ return -ENOMEM;
+ }
+
/* Reset stack metadata. */
kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
@@ -298,11 +303,6 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(stack, 0, THREAD_SIZE);
- if (memcg_charge_kernel_stack(vm_area)) {
- vfree(vm_area->addr);
- return -ENOMEM;
- }
-
tsk->stack_vm_area = vm_area;
tsk->stack = stack;
return 0;
@@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
+ mm_flags_clear_all(mm);
if (current->mm) {
- mm->flags = mmf_init_flags(current->mm->flags);
+ unsigned long flags = __mm_flags_get_word(current->mm);
+
+ __mm_flags_set_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- mm->flags = default_dump_filter;
+ __mm_flags_set_word(mm, default_dump_filter);
mm->def_flags = 0;
}
@@ -1889,7 +1892,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
/* We need to synchronize with __set_oom_adj */
mutex_lock(&oom_adj_mutex);
- set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
/* Update the values in case they were changed after copy_signal */
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -2129,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
-#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
-#endif
p->blocked_on = NULL; /* not blocked yet */
@@ -2544,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu)
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
- CLONE_IO;
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = flags,
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
@@ -2660,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
@@ -2678,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 6a96149aede9..ddc11a8bd2ea 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -10,6 +10,7 @@
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
+#include <linux/oom.h>
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
@@ -40,7 +41,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ if (tsk_is_oom_victim(p))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
@@ -206,6 +207,23 @@ void __thaw_task(struct task_struct *p)
wake_up_state(p, TASK_FROZEN);
}
+/*
+ * thaw_process - Thaw a frozen process
+ * @p: the process to be thawed
+ *
+ * Iterate over all threads of @p and call __thaw_task() on each.
+ */
+void thaw_process(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ rcu_read_lock();
+ for_each_thread(p, t) {
+ __thaw_task(t);
+ }
+ rcu_read_unlock();
+}
+
/**
* set_freezable - make %current freezable
*
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8708a1205f82..b2c1f14b8129 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -95,9 +95,41 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+ unsigned int state = READ_ONCE(t->__state);
+
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer
+ */
+ if (!(state & TASK_UNINTERRUPTIBLE) ||
+ (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+ return false;
+
+ /*
+ * When a freshly created task is scheduled once, changes its state to
+ * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
+ * musn't be checked.
+ */
+ if (unlikely(!switch_count))
+ return false;
+
+ if (switch_count != t->last_switch_count) {
+ t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
+ return false;
+ }
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
-static void debug_show_blocker(struct task_struct *task)
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
@@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task)
t->pid, rwsem_blocked_by);
break;
}
- sched_show_task(t);
+ /* Avoid duplicated task dump, skip if the task is also hung. */
+ if (!task_is_hung(t, timeout))
+ sched_show_task(t);
return;
}
}
#else
-static inline void debug_show_blocker(struct task_struct *task)
+static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
}
#endif
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- /*
- * Ensure the task is not frozen.
- * Also, skip vfork and any other user process that freezer should skip.
- */
- if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
- return;
-
- /*
- * When a freshly created task is scheduled once, changes its state to
- * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
- * musn't be checked.
- */
- if (unlikely(!switch_count))
- return;
-
- if (switch_count != t->last_switch_count) {
- t->last_switch_count = switch_count;
- t->last_switch_time = jiffies;
- return;
- }
- if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ if (!task_is_hung(t, timeout))
return;
/*
@@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_blocker(t);
+ debug_show_blocker(t, timeout);
hung_task_show_lock = true;
if (sysctl_hung_task_all_cpu_backtrace)
@@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
- unsigned int state;
if (!max_count--)
goto unlock;
@@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /*
- * skip the TASK_KILLABLE tasks -- these can be killed
- * skip the TASK_IDLE tasks -- those are genuinely idle
- */
- state = READ_ONCE(t->__state);
- if ((state & TASK_UNINTERRUPTIBLE) &&
- !(state & TASK_WAKEKILL) &&
- !(state & TASK_NOLOAD))
- check_hung_task(t, timeout);
+
+ check_hung_task(t, timeout);
}
unlock:
rcu_read_unlock();
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index cf4af5728307..2b082a7e24a2 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void)
char namebuf[KSYM_NAME_LEN];
struct test_stat *stat, *stat2;
- stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL);
if (!stat)
return -ENOMEM;
stat2 = stat + 1;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 1d85597057e1..6563141f5de9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
memcpy(dst_entries, src_entries, bytes_to_move);
entries_moved = bytes_to_move >> entry_size_log;
+ /*
+ * A write memory barrier is required here, to ensure
+ * that the writes from the memcpy() are visible before
+ * the count is updated. Without this, it is possible for
+ * a user to observe a new count value but stale
+ * coverage data.
+ */
+ smp_wmb();
+
switch (mode) {
case KCOV_MODE_TRACE_PC:
WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 31203f0bacaf..fa00b239c5d9 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void)
if (!image)
return NULL;
- image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
image->control_page = ~0; /* By default this does not apply */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 91d46502a817..eb62a9794242 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+ image->force_dtb = flags & KEXEC_FILE_FORCE_DTB;
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index ecd1ac210dbd..5083c68c3a4e 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -32,6 +32,22 @@
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"
+#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
+
+/*
+ * KHO uses page->private, which is an unsigned long, to store page metadata.
+ * Use it to store both the magic and the order.
+ */
+union kho_page_info {
+ unsigned long page_private;
+ struct {
+ unsigned int order;
+ unsigned int magic;
+ };
+};
+
+static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
+
static bool kho_enable __ro_after_init;
bool kho_is_enabled(void)
@@ -183,11 +199,27 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
return 0;
}
-/* almost as free_reserved_page(), just don't free the page */
-static void kho_restore_page(struct page *page, unsigned int order)
+static struct page *kho_restore_page(phys_addr_t phys)
{
- unsigned int nr_pages = (1 << order);
+ struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ union kho_page_info info;
+ unsigned int nr_pages;
+
+ if (!page)
+ return NULL;
+ info.page_private = page->private;
+ /*
+ * deserialize_bitmap() only sets the magic on the head page. This magic
+ * check also implicitly makes sure phys is order-aligned since for
+ * non-order-aligned phys addresses, magic will never be set.
+ */
+ if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
+ return NULL;
+ nr_pages = (1 << info.order);
+
+ /* Clear private to make sure later restores on this page error out. */
+ page->private = 0;
/* Head page gets refcount of 1. */
set_page_count(page, 1);
@@ -195,10 +227,11 @@ static void kho_restore_page(struct page *page, unsigned int order)
for (unsigned int i = 1; i < nr_pages; i++)
set_page_count(page + i, 0);
- if (order > 0)
- prep_compound_page(page, order);
+ if (info.order > 0)
+ prep_compound_page(page, info.order);
adjust_managed_page_count(page, nr_pages);
+ return page;
}
/**
@@ -209,18 +242,9 @@ static void kho_restore_page(struct page *page, unsigned int order)
*/
struct folio *kho_restore_folio(phys_addr_t phys)
{
- struct page *page = pfn_to_online_page(PHYS_PFN(phys));
- unsigned long order;
+ struct page *page = kho_restore_page(phys);
- if (!page)
- return NULL;
-
- order = page->private;
- if (order > MAX_PAGE_ORDER)
- return NULL;
-
- kho_restore_page(page, order);
- return page_folio(page);
+ return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);
@@ -341,10 +365,13 @@ static void __init deserialize_bitmap(unsigned int order,
phys_addr_t phys =
elm->phys_start + (bit << (order + PAGE_SHIFT));
struct page *page = phys_to_page(phys);
+ union kho_page_info info;
memblock_reserve(phys, sz);
memblock_reserved_mark_noinit(phys, sz);
- page->private = order;
+ info.magic = KHO_PAGE_MAGIC;
+ info.order = order;
+ page->private = info.page_private;
}
}
@@ -405,6 +432,7 @@ static int __init kho_parse_scratch_size(char *p)
{
size_t len;
unsigned long sizes[3];
+ size_t total_size = 0;
int i;
if (!p)
@@ -441,11 +469,19 @@ static int __init kho_parse_scratch_size(char *p)
}
sizes[i] = memparse(p, &endp);
- if (!sizes[i] || endp == p)
+ if (endp == p)
return -EINVAL;
p = endp;
+ total_size += sizes[i];
}
+ if (!total_size)
+ return -EINVAL;
+
+ /* The string should be fully consumed by now. */
+ if (*p)
+ return -EINVAL;
+
scratch_size_lowmem = sizes[0];
scratch_size_global = sizes[1];
scratch_size_pernode = sizes[2];
@@ -952,6 +988,26 @@ static const void *kho_get_fdt(void)
}
/**
+ * is_kho_boot - check if current kernel was booted via KHO-enabled
+ * kexec
+ *
+ * This function checks if the current kernel was loaded through a kexec
+ * operation with KHO enabled, by verifying that a valid KHO FDT
+ * was passed.
+ *
+ * Note: This function returns reliable results only after
+ * kho_populate() has been called during early boot. Before that,
+ * it may return false even if KHO data is present.
+ *
+ * Return: true if booted via KHO-enabled kexec, false otherwise
+ */
+bool is_kho_boot(void)
+{
+ return !!kho_get_fdt();
+}
+EXPORT_SYMBOL_GPL(is_kho_boot);
+
+/**
* kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
* @name: the name of the sub FDT passed to kho_add_subtree().
* @phys: if found, the physical address of the sub FDT is stored in @phys.
@@ -1233,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_enable)
+ if (!kho_out.finalized)
return 0;
image->kho.fdt = page_to_phys(kho_out.ser.fdt);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 78dd3d8c6554..cf6ddd1b23a2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
pi_tree.entry);
}
-#define RT_MUTEX_HAS_WAITERS 1UL
-
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
-{
- unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
-
- return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
-}
-
/*
* Constants for rt mutex functions which have a selectable deadlock
* detection.
diff --git a/kernel/panic.c b/kernel/panic.c
index 72fcbb5a071b..24cc3eec1805 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
-int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
+int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
static unsigned long tainted_mask =
IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
@@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly;
static bool panic_console_replay;
bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+static void panic_print_deprecated(void)
+{
+ pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
#ifdef CONFIG_SYSCTL
/*
@@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write,
static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+ panic_print_deprecated();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
@@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void)
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+bool panic_try_start(void)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
+
+ return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+ return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
/*
* A variant of panic() called from NMI context. We return if we've already
* panicked on this CPU. If another CPU already panicked, loop in
@@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, this_cpu;
-
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- /* atomic_try_cmpxchg updates old_cpu on failure */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
+ if (panic_try_start())
panic("%s", msg);
- else if (old_cpu != this_cpu)
+ else if (panic_on_other_cpu())
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
@@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin)
origin, limit);
}
+static void panic_trigger_all_cpu_backtrace(void)
+{
+ /* Temporary allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
+
+ if (panic_this_cpu_backtrace_printed)
+ trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
+ else
+ trigger_all_cpu_backtrace();
+
+ panic_triggering_all_cpu_backtrace = false;
+}
+
/*
* Helper that triggers the NMI backtrace (if set in panic_print)
* and then performs the secondary CPUs shutdown - we cannot have
@@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & SYS_INFO_ALL_CPU_BT) {
- /* Temporary allow non-panic CPUs to write their backtraces. */
- panic_triggering_all_cpu_backtrace = true;
- trigger_all_cpu_backtrace();
- panic_triggering_all_cpu_backtrace = false;
- }
+ if (panic_print & SYS_INFO_ALL_CPU_BT)
+ panic_trigger_all_cpu_backtrace();
/*
* Note that smp_send_stop() is the usual SMP shutdown function,
@@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args)
static char buf[1024];
long i, i_next = 0, len;
int state = 0;
- int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
if (panic_on_warn) {
@@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args)
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
/* atomic_try_cmpxchg updates old_cpu on failure */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* go ahead */
- } else if (old_cpu != this_cpu)
+ } else if (panic_on_other_cpu())
panic_smp_self_stop();
console_verbose();
@@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args)
buf[len - 1] = '\0';
pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ panic_this_cpu_backtrace_printed = true;
+ } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
-#endif
+ panic_this_cpu_backtrace_printed = true;
+ }
/*
* If kgdb is enabled, give it a chance to run before we stop all
@@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
core_param(panic, panic_timeout, int, 0644);
-core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
core_param(panic_console_replay, panic_console_replay, bool, 0644);
+static int panic_print_set(const char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_set_ulong(val, kp);
+}
+
+static int panic_print_get(char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_get_ulong(val, kp);
+}
+
+static const struct kernel_param_ops panic_print_ops = {
+ .set = panic_print_set,
+ .get = panic_print_get,
+};
+__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);
+
static int __init oops_setup(char *s)
{
if (!s)
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index ef282001f200..f72bbfa266d6 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -332,7 +332,6 @@ struct printk_message {
unsigned long dropped;
};
-bool other_cpu_in_panic(void);
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
bool is_extended, bool may_supress);
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 646801813415..558ef3177976 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -12,6 +12,7 @@
#include <linux/irqflags.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
+#include <linux/panic.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>
@@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
* opportunity to perform any necessary cleanup if they were
* interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
(!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
}
@@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
* Event #4 occurs when a non-panic CPU reacquires.
- * Event #5 is not possible due to the other_cpu_in_panic() check
+ * Event #5 is not possible due to the panic_on_other_cpu() check
* in nbcon_context_try_acquire_handover().
*/
@@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
struct nbcon_state new;
/* Note that the caller must still remove the request! */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/*
@@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
* nbcon_waiter_matches(). In particular, the assumption that
* lower priorities are ignored during panic.
*/
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/* Handover is not possible on the same CPU. */
@@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs;
*/
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
- unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
struct nbcon_state cur;
int err;
@@ -614,7 +614,7 @@ out:
/* Acquire succeeded. */
/* Assign the appropriate buffer for this context. */
- if (atomic_read(&panic_cpu) == cpu)
+ if (panic_on_this_cpu())
ctxt->pbufs = &panic_nbcon_pbufs;
else
ctxt->pbufs = con->pbufs;
@@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void)
{
unsigned int *cpu_emergency_nesting;
- if (this_cpu_in_panic())
+ if (panic_on_this_cpu())
return NBCON_PRIO_PANIC;
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0efbcdda9aab..5aee9ffb16b9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
+#include <linux/panic.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
-static bool panic_in_progress(void)
-{
- return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
-}
-
-/* Return true if a panic is in progress on the current CPU. */
-bool this_cpu_in_panic(void)
-{
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
-}
-
-/*
- * Return true if a panic is in progress on a remote CPU.
- *
- * On true, the local CPU should immediately release any printing resources
- * that may be needed by the panic CPU.
- */
-bool other_cpu_in_panic(void)
-{
- return (panic_in_progress() && !this_cpu_in_panic());
-}
-
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* non-panic CPUs are generating any messages, they will be
* silently dropped.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
!debug_non_panic_cpus &&
!panic_triggering_all_cpu_backtrace)
return 0;
@@ -2843,7 +2816,7 @@ void console_lock(void)
might_sleep();
/* On panic, the console_lock must be left to the panic cpu. */
- while (other_cpu_in_panic())
+ while (panic_on_other_cpu())
msleep(1000);
down_console_sem();
@@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock);
int console_trylock(void)
{
/* On panic, the console_lock must be left to the panic cpu. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return 0;
if (down_trylock_console_sem())
return 0;
@@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
goto abandon;
if (do_cond_resched)
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index d9fb053cff67..e2a1b2d34d2b 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* But it would have the sequence number returned
* by "prb_next_reserve_seq() - 1".
*/
- if (this_cpu_in_panic() &&
+ if (panic_on_this_cpu() &&
(!debug_non_panic_cpus || legacy_allow_panic_sync) &&
((*seq + 1) < prb_next_reserve_seq(rb))) {
(*seq)++;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index b521d0455992..7484d8ad5767 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -796,7 +796,7 @@ kfree_scale_thread(void *arg)
pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
(unsigned long long)(end_time - start_time), kfree_loops,
rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
- (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+ PAGES_TO_MB(mem_begin - mem_during));
if (shutdown) {
smp_mb(); /* Assign before wake. */
diff --git a/kernel/resource.c b/kernel/resource.c
index f9bb5481501a..b9fa2a4ce089 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1388,6 +1388,47 @@ void __release_region(struct resource *parent, resource_size_t start,
EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
+static void append_child_to_parent(struct resource *new_parent, struct resource *new_child)
+{
+ struct resource *child;
+
+ child = new_parent->child;
+ if (child) {
+ while (child->sibling)
+ child = child->sibling;
+ child->sibling = new_child;
+ } else {
+ new_parent->child = new_child;
+ }
+ new_child->parent = new_parent;
+ new_child->sibling = NULL;
+}
+
+/*
+ * Reparent all child resources that no longer belong to "low" after a split to
+ * "high". Note that "high" does not have any children, because "low" is the
+ * original resource and "high" is a new resource. Treat "low" as the original
+ * resource being split and defer its range adjustment to __adjust_resource().
+ */
+static void reparent_children_after_split(struct resource *low,
+ struct resource *high,
+ resource_size_t split_addr)
+{
+ struct resource *child, *next, **p;
+
+ p = &low->child;
+ while ((child = *p)) {
+ next = child->sibling;
+ if (child->start > split_addr) {
+ /* unlink child */
+ *p = next;
+ append_child_to_parent(high, child);
+ } else {
+ p = &child->sibling;
+ }
+ }
+}
+
/**
* release_mem_region_adjustable - release a previously reserved memory region
* @start: resource start address
@@ -1397,15 +1438,13 @@ EXPORT_SYMBOL(__release_region);
* is released from a currently busy memory resource. The requested region
* must either match exactly or fit into a single busy resource entry. In
* the latter case, the remaining resource is adjusted accordingly.
- * Existing children of the busy memory resource must be immutable in the
- * request.
*
* Note:
* - Additional release conditions, such as overlapping region, can be
* supported after they are confirmed as valid cases.
- * - When a busy memory resource gets split into two entries, the code
- * assumes that all children remain in the lower address entry for
- * simplicity. Enhance this logic when necessary.
+ * - When a busy memory resource gets split into two entries, its children are
+ * reassigned to the correct parent based on their range. If a child memory
+ * resource overlaps with more than one parent, enhance the logic as needed.
*/
void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
@@ -1482,6 +1521,7 @@ retry:
new_res->parent = res->parent;
new_res->sibling = res->sibling;
new_res->child = NULL;
+ reparent_children_after_split(res, new_res, end);
if (WARN_ON_ONCE(__adjust_resource(res, res->start,
start - res->start)))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3a89f949e307..bc0b7ce8a65d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
* by the PTE scanner and NUMA hinting faults should be trapped based
* on resident pages
*/
- nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
rss = get_mm_rss(p->mm);
if (!rss)
rss = nr_scan_pages;
@@ -1923,17 +1923,18 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
struct pglist_data *pgdat;
unsigned long rate_limit;
unsigned int latency, th, def_th;
+ long nr = folio_nr_pages(folio);
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat)) {
/* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0;
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
return true;
}
def_th = sysctl_numa_balancing_hot_threshold;
- rate_limit = sysctl_numa_balancing_promote_rate_limit << \
- (20 - PAGE_SHIFT);
+ rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
@@ -1941,8 +1942,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
if (latency >= th)
return false;
- return !numa_promotion_rate_limit(pgdat, rate_limit,
- folio_nr_pages(folio));
+ return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1e28b40053ce..8b58eece4e58 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
struct rlimit old, new;
struct task_struct *tsk;
unsigned int checkflags = 0;
+ bool need_tasklist;
int ret;
if (old_rlim)
@@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
get_task_struct(tsk);
rcu_read_unlock();
- ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
- old_rlim ? &old : NULL);
+ need_tasklist = !same_thread_group(tsk, current);
+ if (need_tasklist) {
+ /*
+ * Ensure we can't race with group exit or de_thread(),
+ * so tsk->group_leader can't be freed or changed until
+ * read_unlock(tasklist_lock) below.
+ */
+ read_lock(&tasklist_lock);
+ if (!pid_alive(tsk))
+ ret = -ESRCH;
+ }
+
+ if (!ret) {
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+ }
+
+ if (need_tasklist)
+ read_unlock(&tasklist_lock);
if (!ret && old_rlim) {
rlim_to_rlim64(&old, &old64);
@@ -2392,9 +2410,9 @@ static inline unsigned long get_current_mdwe(void)
{
unsigned long ret = 0;
- if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (mm_flags_test(MMF_HAS_MDWE, current->mm))
ret |= PR_MDWE_REFUSE_EXEC_GAIN;
- if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm))
ret |= PR_MDWE_NO_INHERIT;
return ret;
@@ -2427,9 +2445,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
return -EPERM; /* Cannot unset the flags */
if (bits & PR_MDWE_NO_INHERIT)
- set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
+ mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
- set_bit(MMF_HAS_MDWE, &current->mm->flags);
+ mm_flags_set(MMF_HAS_MDWE, current->mm);
return 0;
}
@@ -2452,6 +2470,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len)
return sizeof(mm->saved_auxv);
}
+static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ /* If disabled, we return "1 | flags", otherwise 0. */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ return 1;
+ else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm))
+ return 1 | PR_THP_DISABLE_EXCEPT_ADVISED;
+ return 0;
+}
+
+static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 || arg5)
+ return -EINVAL;
+
+ /* Flags are only allowed when disabling. */
+ if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
+ return -EINVAL;
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ if (thp_disable) {
+ if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ } else {
+ mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ } else {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ mmap_write_unlock(current->mm);
+ return 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2470,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = -EINVAL;
break;
}
+ /*
+ * Ensure that either:
+ *
+ * 1. Subsequent getppid() calls reflect the parent process having died.
+ * 2. forget_original_parent() will send the new me->pdeath_signal.
+ *
+ * Also prevent the read of me->pdeath_signal from being a data race.
+ */
+ read_lock(&tasklist_lock);
me->pdeath_signal = arg2;
+ read_unlock(&tasklist_lock);
break;
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2625,20 +2698,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
+ error = prctl_get_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_SET_THP_DISABLE:
- if (arg3 || arg4 || arg5)
- return -EINVAL;
- if (mmap_write_lock_killable(me->mm))
- return -EINTR;
- if (arg2)
- set_bit(MMF_DISABLE_THP, &me->mm->flags);
- else
- clear_bit(MMF_DISABLE_THP, &me->mm->flags);
- mmap_write_unlock(me->mm);
+ error = prctl_set_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
@@ -2770,7 +2833,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm);
break;
#endif
case PR_RISCV_V_SET_CONTROL:
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 1b69caa87480..0ba8e3c50d62 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -858,6 +858,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+EXPORT_SYMBOL_GPL(timespec64_add_safe);
/**
* get_timespec64 - get user's time value into kernel space
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..5b62d1002783 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail);
*/
static u16 get_16bit_precision(u64 data_ns)
{
- return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+ /*
+ * 2^24ns ~= 16.8ms
+ * Round to the nearest multiple of 16.8 milliseconds.
+ */
+ return (data_ns + (1 << 23)) >> 24LL;
}
static void update_cpustat(void)
@@ -444,6 +448,14 @@ static void update_cpustat(void)
old_stat = __this_cpu_read(cpustat_old[i]);
new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ /*
+ * Since we use 16-bit precision, the raw data will undergo
+ * integer division, which may sometimes result in data loss,
+ * and then result might exceed 100%. To avoid confusion,
+ * we enforce a 100% display cap when calculations exceed this threshold.
+ */
+ if (util > 100)
+ util = 100;
__this_cpu_write(cpustat_util[tail][i], util);
__this_cpu_write(cpustat_old[i], new_stat);
}
@@ -455,17 +467,17 @@ static void print_cpustat(void)
{
int i, group;
u8 tail = __this_cpu_read(cpustat_tail);
- u64 sample_period_second = sample_period;
+ u64 sample_period_msecond = sample_period;
- do_div(sample_period_second, NSEC_PER_SEC);
+ do_div(sample_period_msecond, NSEC_PER_MSEC);
/*
* Outputting the "watchdog" prefix on every line is redundant and not
* concise, and the original alarm information is sufficient for
* positioning in logs, hence here printk() is used instead of pr_crit().
*/
- printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
- smp_processor_id(), sample_period_second);
+ printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
+ smp_processor_id(), sample_period_msecond);
for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
group = (tail + i) % NUM_SAMPLE_PERIODS;
@@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (!watchdog_enabled)
return HRTIMER_NORESTART;
+ /*
+ * pass the buddy check if a panic is in process
+ */
+ if (panic_in_progress())
+ return HRTIMER_NORESTART;
+
watchdog_hardlockup_kick();
/* kick the softlockup detector */
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 9c58f5b4381d..d3ca70e3c256 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "NMI watchdog: " fmt
+#include <linux/panic.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
@@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (panic_in_progress())
+ return;
+
if (!watchdog_check_timestamp())
return;