From 59ec71575ab440cd5ca0aa53b2a2985b3639fad4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 29 Nov 2021 21:37:25 +0100 Subject: ucounts: Fix rlimit max values check The semantics of the rlimit max values differs from ucounts itself. When creating a new userns, we store the current rlimit of the process in ucount_max. Thus, the value of the limit in the parent userns is saved in the created one. The problem is that now we are taking the maximum value for counter from the same userns. So for init_user_ns it will always be RLIM_INFINITY. To fix the problem we need to check the counter value with the max value stored in userns. Reproducer: su - test -c "ulimit -u 3; sleep 5 & sleep 6 & unshare -U --map-root-user sh -c 'sleep 7 & sleep 8 & date; wait'" Before: [1] 175 [2] 176 Fri Nov 26 13:48:20 UTC 2021 [1]- Done sleep 5 [2]+ Done sleep 6 After: [1] 167 [2] 168 sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: Interrupted system call [1]- Done sleep 5 [2]+ Done sleep 6 Fixes: c54b245d0118 ("Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace") Reported-by: Gleb Fotengauer-Malinovskiy Signed-off-by: "Eric W. Biederman" Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/024ec805f6e16896f0b23e094773790d171d2c1c.1638218242.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 4f5613dac227..7b32c356ebc5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -264,15 +264,16 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) { struct ucounts *iter; + long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(v, &iter->ucount[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); } return ret; } @@ -312,15 +313,16 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; + long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(1, &iter->ucount[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. @@ -339,15 +341,16 @@ unwind: return 0; } -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit) { struct ucounts *iter; - if (get_ucounts_value(ucounts, type) > max) - return true; + long max = rlimit; + if (rlimit > LONG_MAX) + max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - max = READ_ONCE(iter->ns->ucount_max[type]); if (get_ucounts_value(iter, type) > max) return true; + max = READ_ONCE(iter->ns->ucount_max[type]); } return false; } -- cgit v1.2.3 From 6c3118c32129b4197999a8928ba776bcabd0f5c4 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Fri, 10 Dec 2021 14:55:03 -0800 Subject: signal: Skip the altstack update when not needed == Background == Support for large, "dynamic" fpstates was recently merged. This included code to ensure that sigaltstacks are sufficiently sized for these large states. A new lock was added to remove races between enabling large features and setting up sigaltstacks. == Problem == The new lock (sigaltstack_lock()) is acquired in the sigreturn path before restoring the old sigaltstack. Unfortunately, contention on the new lock causes a measurable signal handling performance regression [1]. However, the common case is that no *changes* are made to the sigaltstack state at sigreturn. == Solution == do_sigaltstack() acquires sigaltstack_lock() and is used for both sys_sigaltstack() and restoring the sigaltstack in sys_sigreturn(). Check for changes to the sigaltstack before taking the lock. If no changes were made, return before acquiring the lock. This removes lock contention from the common-case sigreturn path. [1] https://lore.kernel.org/lkml/20211207012128.GA16074@xsang-OptiPlex-9020/ Fixes: 3aac3ebea08f ("x86/signal: Implement sigaltstack size validation") Reported-by: kernel test robot Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20211210225503.12734-1-chang.seok.bae@intel.com --- kernel/signal.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a629b11bf3e0..dfcee3888b00 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4185,6 +4185,15 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + /* + * Return before taking any locks if no actual + * sigaltstack changes were requested. + */ + if (t->sas_ss_sp == (unsigned long)ss_sp && + t->sas_ss_size == ss_size && + t->sas_ss_flags == ss_flags) + return 0; + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; -- cgit v1.2.3 From 4e8c11b6b3f0b6a283e898344f154641eda94266 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Mon, 13 Dec 2021 21:57:27 +0800 Subject: timekeeping: Really make sure wall_to_monotonic isn't positive Even after commit e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") it is still possible to make wall_to_monotonic positive by running the following code: int main(void) { struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); time.tv_nsec = 0; clock_settime(CLOCK_REALTIME, &time); return 0; } The reason is that the second parameter of timespec64_compare(), ts_delta, may be unnormalized because the delta is calculated with an open coded substraction which causes the comparison of tv_sec to yield the wrong result: wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -9, .tv_nsec = -900000000 } That makes timespec64_compare() claim that wall_to_monotonic < ts_delta, but actually the result should be wall_to_monotonic > ts_delta. After normalization, the result of timespec64_compare() is correct because the tv_sec comparison is not longer misleading: wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -10, .tv_nsec = 100000000 } Use timespec64_sub() to ensure that ts_delta is normalized, which fixes the issue. Fixes: e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") Signed-off-by: Yu Liao Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211213135727.1656662-1-liaoyu15@huawei.com --- kernel/time/timekeeping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b348749a9fc6..dcdcb85121e4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1306,8 +1306,7 @@ int do_settimeofday64(const struct timespec64 *ts) timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { ret = -EINVAL; -- cgit v1.2.3 From 8f556a326c93213927e683fc32bbf5be1b62540a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 17 Dec 2021 15:42:07 +0800 Subject: locking/rtmutex: Fix incorrect condition in rtmutex_spin_on_owner() Optimistic spinning needs to be terminated when the spinning waiter is not longer the top waiter on the lock, but the condition is negated. It terminates if the waiter is the top waiter, which is defeating the whole purpose. Fixes: c3123c431447 ("locking/rtmutex: Dont dereference waiter lockless") Signed-off-by: Zqiang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211217074207.77425-1-qiang1.zhang@intel.com --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0c6a48dfcecb..1f25a4d7de27 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1380,7 +1380,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner->on_cpu || need_resched() || - rt_mutex_waiter_is_top_waiter(lock, waiter) || + !rt_mutex_waiter_is_top_waiter(lock, waiter) || vcpu_is_preempted(task_cpu(owner))) { res = false; break; -- cgit v1.2.3 From 71d2bcec2d4d69ff109c497e6611d6c53c8926d4 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 24 Dec 2021 21:12:39 -0800 Subject: kernel/crash_core: suppress unknown crashkernel parameter warning When booting with crashkernel= on the kernel command line a warning similar to Kernel command line: ro console=ttyS0 crashkernel=256M Unknown kernel command line parameters "crashkernel=256M", will be passed to user space. is printed. This comes from crashkernel= being parsed independent from the kernel parameter handling mechanism. So the code in init/main.c doesn't know that crashkernel= is a valid kernel parameter and prints this incorrect warning. Suppress the warning by adding a dummy early_param handler for crashkernel=. Link: https://lkml.kernel.org/r/20211208133443.6867-1-prudo@redhat.com Fixes: 86d1919a4fb0 ("init: print out unknown kernel parameters") Signed-off-by: Philipp Rudo Acked-by: Baoquan He Cc: Andrew Halaney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index eb53f5ec62c9..256cf6db573c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { -- cgit v1.2.3