Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.kexec | 9
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/acct.c | 29
-rw-r--r-- kernel/audit.h | 2
-rw-r--r-- kernel/auditfilter.c | 3
-rw-r--r-- kernel/auditsc.c | 43
-rw-r--r-- kernel/bpf/bpf_iter.c | 29
-rw-r--r-- kernel/bpf/helpers.c | 53
-rw-r--r-- kernel/bpf/liveness.c | 4
-rw-r--r-- kernel/bpf/ringbuf.c | 2
-rw-r--r-- kernel/bpf/stackmap.c | 4
-rw-r--r-- kernel/bpf/stream.c | 3
-rw-r--r-- kernel/bpf/syscall.c | 19
-rw-r--r-- kernel/bpf/token.c | 47
-rw-r--r-- kernel/bpf/trampoline.c | 5
-rw-r--r-- kernel/bpf/verifier.c | 18
-rw-r--r-- kernel/cgroup/cgroup.c | 23
-rw-r--r-- kernel/cgroup/cpuset.c | 17
-rw-r--r-- kernel/cgroup/legacy_freezer.c | 2
-rw-r--r-- kernel/cgroup/namespace.c | 2
-rw-r--r-- kernel/cpu.c | 19
-rw-r--r-- kernel/crash_core.c | 2
-rw-r--r-- kernel/cred.c | 33
-rw-r--r-- kernel/dma/debug.c | 5
-rw-r--r-- kernel/dma/direct.c | 1
-rw-r--r-- kernel/entry/common.c | 39
-rw-r--r-- kernel/entry/syscall-common.c | 8
-rw-r--r-- kernel/events/callchain.c | 14
-rw-r--r-- kernel/events/core.c | 108
-rw-r--r-- kernel/events/uprobes.c | 6
-rw-r--r-- kernel/exit.c | 11
-rw-r--r-- kernel/fork.c | 10
-rw-r--r-- kernel/freezer.c | 2
-rw-r--r-- kernel/futex/core.c | 16
-rw-r--r-- kernel/futex/futex.h | 58
-rw-r--r-- kernel/gcov/gcc_4_7.c | 4
-rw-r--r-- kernel/irq/chip.c | 35
-rw-r--r-- kernel/irq/handle.c | 10
-rw-r--r-- kernel/irq/irqdesc.c | 24
-rw-r--r-- kernel/irq/irqdomain.c | 32
-rw-r--r-- kernel/irq/manage.c | 176
-rw-r--r-- kernel/irq/msi.c | 2
-rw-r--r-- kernel/irq/proc.c | 2
-rw-r--r-- kernel/kexec_handover.c | 95
-rw-r--r-- kernel/kexec_handover_debug.c | 25
-rw-r--r-- kernel/kexec_handover_internal.h | 20
-rw-r--r-- kernel/kthread.c | 15
-rw-r--r-- kernel/livepatch/Kconfig | 12
-rw-r--r-- kernel/livepatch/core.c | 8
-rw-r--r-- kernel/locking/mutex-debug.c | 10
-rw-r--r-- kernel/locking/mutex.c | 28
-rw-r--r-- kernel/locking/mutex.h | 5
-rw-r--r-- kernel/locking/rtmutex_api.c | 19
-rw-r--r-- kernel/locking/spinlock_debug.c | 4
-rw-r--r-- kernel/module/main.c | 17
-rw-r--r-- kernel/nscommon.c | 246
-rw-r--r-- kernel/nsproxy.c | 57
-rw-r--r-- kernel/nstree.c | 782
-rw-r--r-- kernel/padata.c | 12
-rw-r--r-- kernel/panic.c | 16
-rw-r--r-- kernel/pid.c | 12
-rw-r--r-- kernel/pid_namespace.c | 2
-rw-r--r-- kernel/power/Kconfig | 11
-rw-r--r-- kernel/power/Makefile | 4
-rw-r--r-- kernel/power/console.c | 8
-rw-r--r-- kernel/power/em_netlink.c | 308
-rw-r--r-- kernel/power/em_netlink.h | 39
-rw-r--r-- kernel/power/em_netlink_autogen.c | 48
-rw-r--r-- kernel/power/em_netlink_autogen.h | 23
-rw-r--r-- kernel/power/energy_model.c | 90
-rw-r--r-- kernel/power/hibernate.c | 19
-rw-r--r-- kernel/power/main.c | 103
-rw-r--r-- kernel/power/power.h | 1
-rw-r--r-- kernel/power/process.c | 1
-rw-r--r-- kernel/power/qos.c | 106
-rw-r--r-- kernel/power/snapshot.c | 13
-rw-r--r-- kernel/power/suspend.c | 16
-rw-r--r-- kernel/power/swap.c | 278
-rw-r--r-- kernel/power/user.c | 4
-rw-r--r-- kernel/ptrace.c | 6
-rw-r--r-- kernel/rcu/tiny.c | 8
-rw-r--r-- kernel/rcu/tree.c | 14
-rw-r--r-- kernel/rcu/tree_exp.h | 3
-rw-r--r-- kernel/rcu/tree_plugin.h | 9
-rw-r--r-- kernel/rcu/tree_stall.h | 3
-rw-r--r-- kernel/rseq.c | 655
-rw-r--r-- kernel/sched/core.c | 1230
-rw-r--r-- kernel/sched/cpudeadline.c | 34
-rw-r--r-- kernel/sched/cpudeadline.h | 4
-rw-r--r-- kernel/sched/cputime.c | 20
-rw-r--r-- kernel/sched/deadline.c | 339
-rw-r--r-- kernel/sched/debug.c | 8
-rw-r--r-- kernel/sched/ext.c | 285
-rw-r--r-- kernel/sched/fair.c | 653
-rw-r--r-- kernel/sched/features.h | 7
-rw-r--r-- kernel/sched/idle.c | 41
-rw-r--r-- kernel/sched/isolation.c | 23
-rw-r--r-- kernel/sched/membarrier.c | 8
-rw-r--r-- kernel/sched/rt.c | 13
-rw-r--r-- kernel/sched/sched.h | 666
-rw-r--r-- kernel/sched/stats.h | 2
-rw-r--r-- kernel/sched/stop_task.c | 13
-rw-r--r-- kernel/sched/syscalls.c | 100
-rw-r--r-- kernel/sched/topology.c | 114
-rw-r--r-- kernel/signal.c | 2
-rw-r--r-- kernel/task_work.c | 8
-rw-r--r-- kernel/time/hrtimer.c | 4
-rw-r--r-- kernel/time/namespace.c | 5
-rw-r--r-- kernel/time/posix-cpu-timers.c | 4
-rw-r--r-- kernel/time/posix-timers.c | 14
-rw-r--r-- kernel/time/tick-oneshot.c | 20
-rw-r--r-- kernel/time/tick-sched.c | 41
-rw-r--r-- kernel/time/timekeeping.c | 27
-rw-r--r-- kernel/time/timer.c | 9
-rw-r--r-- kernel/time/timer_migration.c | 487
-rw-r--r-- kernel/time/timer_migration.h | 2
-rw-r--r-- kernel/trace/ftrace.c | 60
-rw-r--r-- kernel/trace/ring_buffer.c | 4
-rw-r--r-- kernel/trace/rv/monitors/pagefault/Kconfig | 1
-rw-r--r-- kernel/trace/rv/rv.c | 12
-rw-r--r-- kernel/trace/trace.c | 10
-rw-r--r-- kernel/trace/trace_events_hist.c | 6
-rw-r--r-- kernel/trace/trace_events_user.c | 22
-rw-r--r-- kernel/trace/trace_fprobe.c | 7
-rw-r--r-- kernel/unwind/deferred.c | 44
-rw-r--r-- kernel/unwind/user.c | 59
-rw-r--r-- kernel/user.c | 7
-rw-r--r-- kernel/watch_queue.c | 4
128 files changed, 5523 insertions, 2983 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 422270d64820..54e581072617 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -109,6 +109,15 @@ config KEXEC_HANDOVER
to keep data or state alive across the kexec. For this to work,
both source and target kernels need to have this option enabled.
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+ subsystem. Since KHO performance is crucial in live update
+ scenarios and the extra code might add overhead, it is
+ only optionally enabled.
+
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
diff --git a/kernel/Makefile b/kernel/Makefile
index df3dd8291bb6..9fe722305c9b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/acct.c b/kernel/acct.c
index 61630110e29d..2a2b3c874acd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct)
static void acct_write_process(struct bsd_acct_struct *acct)
{
struct file *file = acct->file;
- const struct cred *cred;
acct_t *ac = &acct->ac;
/* Perform file operations on behalf of whoever enabled accounting */
- cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system. Then get freeze protection. If
- * the fs is frozen, just skip the write as we could deadlock
- * the system otherwise.
- */
- if (check_free_space(acct) && file_start_write_trylock(file)) {
- /* it's been opened O_APPEND, so position is irrelevant */
- loff_t pos = 0;
- __kernel_write(file, ac, sizeof(acct_t), &pos);
- file_end_write(file);
+ scoped_with_creds(file->f_cred) {
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
+ */
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
}
-
- revert_creds(cred);
}
static void do_acct_process(struct bsd_acct_struct *acct)
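The acct.c hunk above replaces the manual override_creds()/revert_creds() pair with a scoped_with_creds() block, so the credential override is reverted on every exit path of the block. A minimal userspace sketch of the same scope-bound idea, using GCC/Clang's cleanup attribute (the names and mechanism are illustrative stand-ins, not the kernel's scoped_with_creds() implementation):

#include <stdio.h>

static int current_uid = 1000;                   /* pretend credential */

static void revert_uid(int *saved)
{
        current_uid = *saved;
}

/* Run the block with 'uid' in effect; revert_uid() runs on any exit. */
#define scoped_with_uid(uid)                                           \
        for (int __saved __attribute__((cleanup(revert_uid))) =        \
                     current_uid, __once = ((current_uid = (uid)), 1); \
             __once; __once = 0)

int main(void)
{
        scoped_with_uid(0) {                     /* act as root inside */
                printf("inside: uid=%d\n", current_uid);
        }
        printf("after:  uid=%d\n", current_uid); /* reverted to 1000 */
        return 0;
}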
diff --git a/kernel/audit.h b/kernel/audit.h
index 0f05933a173b..7c401729e21b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -138,7 +138,7 @@ struct audit_context {
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
- /* Save things to print about task_struct */
+ /* Save things to print about task_struct */
pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index c401082d9b25..6a86c0683b67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -638,10 +638,9 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
+ data = kzalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
- memset(data, 0, sizeof(*data));
data->flags = krule->flags | krule->listnr;
data->action = krule->action;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d1966144bdfe..dd0563a8e0be 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2416,41 +2416,36 @@ void __audit_inode_child(struct inode *parent,
if (inode)
handle_one(inode);
- /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name ||
- (n->type != AUDIT_TYPE_PARENT &&
- n->type != AUDIT_TYPE_UNKNOWN))
+ /* can only match entries that have a name */
+ if (!n->name)
continue;
- if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
- !audit_compare_dname_path(dname,
- n->name->name, n->name_len)) {
- if (n->type == AUDIT_TYPE_UNKNOWN)
- n->type = AUDIT_TYPE_PARENT;
+ /* look for a parent entry first */
+ if (!found_parent &&
+ (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname, n->name->name, n->name_len))) {
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
- break;
- }
- }
-
- cond_resched();
-
- /* is there a matching child entry? */
- list_for_each_entry(n, &context->names_list, list) {
- /* can only match entries that have a name */
- if (!n->name ||
- (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
+ if (found_child)
+ break;
continue;
+ }
- if (!strcmp(dname->name, n->name->name) ||
- !audit_compare_dname_path(dname, n->name->name,
+ /* is there a matching child entry? */
+ if (!found_child &&
+ (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (!strcmp(dname->name, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
- AUDIT_NAME_FULL)) {
+ AUDIT_NAME_FULL))) {
if (n->type == AUDIT_TYPE_UNKNOWN)
n->type = type;
found_child = n;
- break;
+ if (found_parent)
+ break;
}
}
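The auditsc.c rework above folds the separate parent and child list walks into one pass that stops as soon as both entries have been found. A simplified sketch of that single-pass search, with made-up types standing in for the audit names list (the real code additionally compares names, inode numbers and devices):

#include <stdio.h>
#include <stddef.h>

enum { TYPE_UNKNOWN, TYPE_PARENT, TYPE_CHILD };

struct entry {
        struct entry *next;
        int type;
        const char *name;
};

static void find_both(struct entry *head, struct entry **parent,
                      struct entry **child)
{
        *parent = *child = NULL;

        for (struct entry *e = head; e; e = e->next) {
                if (!e->name)
                        continue;       /* can only match named entries */
                if (!*parent && (e->type == TYPE_PARENT ||
                                 e->type == TYPE_UNKNOWN)) {
                        *parent = e;
                } else if (!*child && (e->type == TYPE_CHILD ||
                                       e->type == TYPE_UNKNOWN)) {
                        *child = e;
                }
                if (*parent && *child)
                        break;          /* both found: stop early */
        }
}

int main(void)
{
        struct entry child  = { .next = NULL,   .type = TYPE_CHILD,  .name = "file" };
        struct entry parent = { .next = &child, .type = TYPE_PARENT, .name = "dir" };
        struct entry *p, *c;

        find_both(&parent, &p, &c);
        printf("parent=%s child=%s\n", p ? p->name : "-", c ? c->name : "-");
        return 0;
}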
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 6ac35430c573..eec60b57bd3d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -634,37 +634,24 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
- struct file *file;
unsigned int flags;
- int err, fd;
+ int err;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- return fd;
-
- file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto free_fd;
- }
+
+ FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags));
+ if (fdf.err)
+ return fdf.err;
iter_link = container_of(link, struct bpf_iter_link, link);
- err = prepare_seq_file(file, iter_link);
+ err = prepare_seq_file(fd_prepare_file(fdf), iter_link);
if (err)
- goto free_file;
+ return err; /* Automatic cleanup handles fput */
- fd_install(fd, file);
- return fd;
-
-free_file:
- fput(file);
-free_fd:
- put_unused_fd(fd);
- return err;
+ return fd_publish(fdf);
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
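The bpf_iter.c conversion above drops the hand-rolled get_unused_fd_flags()/anon_inode_getfile()/fd_install() sequence in favour of the FD_PREPARE()/fd_publish() helpers, so nothing becomes visible to userspace until setup has fully succeeded and error paths need no manual fput()/put_unused_fd(). A userspace sketch of the same prepare-then-publish shape (plain POSIX calls, not the kernel helpers):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>                   /* memfd_create() */

static int publish_prepared_fd(const char *payload)
{
        int fd = memfd_create("example", 0);

        if (fd < 0)
                return -1;

        /* All setup happens while the fd is still private to us. */
        if (write(fd, payload, strlen(payload)) < 0 ||
            lseek(fd, 0, SEEK_SET) < 0) {
                close(fd);              /* single unwind path */
                return -1;
        }

        return fd;                      /* "publish": caller owns it now */
}

int main(void)
{
        char buf[16] = "";
        int fd = publish_prepared_fd("hello\n");

        if (fd < 0)
                return 1;
        read(fd, buf, sizeof(buf) - 1);
        printf("%s", buf);
        close(fd);
        return 0;
}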
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c9fab9a356df..e4007fea4909 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work)
rcu_read_unlock_trace();
}
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
static void bpf_wq_delete_work(struct work_struct *work)
{
struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
cancel_work_sync(&w->work);
- kfree_rcu(w, cb.rcu);
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
}
static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
- * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * call_rcu() right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
- * kmalloc_nolock() is available, avoid locking issues by using
- * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
- */
- cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
* or pinned in bpffs.
*/
WRITE_ONCE(async->cb, NULL);
- kfree(cb);
+ kfree_nolock(cb);
ret = -EPERM;
}
out:
@@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
- * kfree_rcu, even though no more timers can be armed.
+ * call_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val)
* completion.
*/
if (hrtimer_try_to_cancel(&t->timer) >= 0)
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
else
queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
@@ -4166,7 +4169,8 @@ release_prog:
}
/**
- * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4175,15 +4179,17 @@ release_prog:
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
}
/**
- * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4192,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
@@ -4342,6 +4349,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_local_irq_save)
BTF_ID_FLAGS(func, bpf_local_irq_restore)
+#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
@@ -4350,6 +4358,7 @@ BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
@@ -4371,9 +4380,9 @@ BTF_ID_FLAGS(func, bpf_strnstr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 3c611aba7f52..1e6538f59a78 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -195,8 +195,10 @@ static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
return ERR_PTR(-ENOMEM);
result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
GFP_KERNEL_ACCOUNT);
- if (!result->must_write_set)
+ if (!result->must_write_set) {
+ kvfree(result);
return ERR_PTR(-ENOMEM);
+ }
memcpy(&result->callchain, callchain, sizeof(*callchain));
result->insn_cnt = subprog_sz;
hash_add(liveness->func_instances, &result->hl_node, key);
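The liveness.c fix above releases the partially constructed func_instance when its second allocation fails; without the added kvfree() the first allocation would leak. The same pattern in plain C, with illustrative names:

#include <stdio.h>
#include <stdlib.h>

struct instance {
        int *must_write_set;
        size_t insn_cnt;
};

static struct instance *new_instance(size_t n)
{
        struct instance *ins = calloc(1, sizeof(*ins));

        if (!ins)
                return NULL;
        ins->must_write_set = calloc(n, sizeof(*ins->must_write_set));
        if (!ins->must_write_set) {
                free(ins);              /* without this, 'ins' leaks */
                return NULL;
        }
        ins->insn_cnt = n;
        return ins;
}

int main(void)
{
        struct instance *ins = new_instance(16);

        printf("%s\n", ins ? "allocated" : "failed");
        if (ins)
                free(ins->must_write_set);
        free(ins);
        return 0;
}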
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 719d73299397..d706c4b7f532 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -216,6 +216,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
+ irq_work_sync(&rb->work);
+
/* copy pages pointer and nr_pages to local variable, as we are going
* to unmap rb itself with vunmap() below
*/
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..8f1dacaf01fe 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
+ false, false, 0);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask, false, 0);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index eb6c5a21c2ef..ff16c631951b 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -355,7 +355,8 @@ __bpf_kfunc_start_defs();
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
* enum in headers.
*/
-__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, void *aux__prog)
{
struct bpf_bprintf_data data = {
.get_bin_args = true,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a9456a3e730..6cde6a46babf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
return ptr;
}
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
@@ -2315,7 +2330,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (!in_irq() && !irqs_disabled())
+ if (!in_hardirq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -2413,7 +2428,7 @@ static void __bpf_prog_put(struct bpf_prog *prog)
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
} else {
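The new bpf_map_kmalloc_nolock() above follows the usual set-active-memcg / allocate / restore / put sequence so the allocation is charged to the map's cgroup. A userspace sketch of that save-and-restore-around-an-operation shape (a thread-local "active account" stands in for the active memcg; names are illustrative):

#include <stdlib.h>

static __thread int active_account;             /* stands in for the memcg */

static int set_active_account(int acct)
{
        int old = active_account;

        active_account = acct;
        return old;
}

static void *account_alloc(int acct, size_t size)
{
        int old = set_active_account(acct);      /* charge this account */
        void *p = malloc(size);                  /* would be accounted here */

        set_active_account(old);                 /* always restore */
        return p;
}

int main(void)
{
        void *p = account_alloc(42, 64);

        free(p);
        return 0;
}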
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 0bbe412f854e..feecd8f4dbf9 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = {
int bpf_token_create(union bpf_attr *attr)
{
+ struct bpf_token *token __free(kfree) = NULL;
struct bpf_mount_opts *mnt_opts;
- struct bpf_token *token = NULL;
struct user_namespace *userns;
struct inode *inode;
- struct file *file;
CLASS(fd, f)(attr->token_create.bpffs_fd);
struct path path;
struct super_block *sb;
umode_t mode;
- int err, fd;
+ int err;
if (fd_empty(f))
return -EBADF;
@@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr)
inode->i_fop = &bpf_token_fops;
clear_nlink(inode); /* make sure it is unlinked */
- file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
- if (IS_ERR(file)) {
- iput(inode);
- return PTR_ERR(file);
- }
+ FD_PREPARE(fdf, O_CLOEXEC,
+ alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME,
+ O_RDWR, &bpf_token_fops));
+ if (fdf.err)
+ return fdf.err;
token = kzalloc(sizeof(*token), GFP_USER);
- if (!token) {
- err = -ENOMEM;
- goto out_file;
- }
+ if (!token)
+ return -ENOMEM;
atomic64_set(&token->refcnt, 1);
- /* remember bpffs owning userns for future ns_capable() checks */
- token->userns = get_user_ns(userns);
-
+ /* remember bpffs owning userns for future ns_capable() checks. */
+ token->userns = userns;
token->allowed_cmds = mnt_opts->delegate_cmds;
token->allowed_maps = mnt_opts->delegate_maps;
token->allowed_progs = mnt_opts->delegate_progs;
@@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr)
err = security_bpf_token_create(token, attr, &path);
if (err)
- goto out_token;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- err = fd;
- goto out_token;
- }
-
- file->private_data = token;
- fd_install(fd, file);
-
- return fd;
+ return err;
-out_token:
- bpf_token_free(token);
-out_file:
- fput(file);
- return err;
+ get_user_ns(token->userns);
+ fd_prepare_file(fdf)->private_data = no_free_ptr(token);
+ return fd_publish(fdf);
}
int bpf_token_get_info_by_fd(struct bpf_token *token,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5949095e51c3..f2cb0b097093 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -479,11 +479,6 @@ again:
* BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
* trampoline again, and retry register.
*/
- /* reset fops->func and fops->trampoline for re-register */
- tr->fops->func = NULL;
- tr->fops->trampoline = 0;
-
- /* free im memory and reallocate later */
bpf_tramp_image_free(im);
goto again;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ff40e5e65c43..fbe4bb91c564 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
struct bpf_func_state *fold, *fcur;
- int i, fr;
+ int i, fr, num_slots;
reset_idmap_scratch(env);
for (fr = old->curframe; fr >= 0; fr--) {
@@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
&fcur->regs[i],
&env->idmap_scratch);
- for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
+ fcur->allocated_stack / BPF_REG_SIZE);
+ for (i = 0; i < num_slots; i++) {
if (!is_spilled_reg(&fold->stack[i]) ||
!is_spilled_reg(&fcur->stack[i]))
continue;
@@ -12259,8 +12261,8 @@ enum special_kfunc_type {
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
KF___bpf_trap,
- KF_bpf_task_work_schedule_signal,
- KF_bpf_task_work_schedule_resume,
+ KF_bpf_task_work_schedule_signal_impl,
+ KF_bpf_task_work_schedule_resume_impl,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12331,13 +12333,13 @@ BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal)
-BTF_ID(func, bpf_task_work_schedule_resume)
+BTF_ID(func, bpf_task_work_schedule_signal_impl)
+BTF_ID(func, bpf_task_work_schedule_resume_impl)
static bool is_task_work_add_kfunc(u32 func_id)
{
- return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
- func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6ae5f48cf64e..ae1eb7a85eb4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
.user_ns = &init_user_ns,
- .ns.ops = &cgroupns_operations,
- .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
- .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
} else {
/*
* NOTE: This function may be called from bpf_cgroup_from_id()
- * on a task which has already passed exit_task_namespaces() and
- * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
- * cgroups visible for lookups.
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
*/
return &cgrp_dfl_root.cgrp;
}
@@ -5363,7 +5360,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
- const struct cred *saved_cred;
ssize_t ret;
enum cgroup_attach_lock_mode lock_mode;
@@ -5386,11 +5382,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* permissions using the credentials from file open to protect against
* inherited fd attacks.
*/
- saved_cred = override_creds(of->file->f_cred);
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb,
- threadgroup, ctx->ns);
- revert_creds(saved_cred);
+ scoped_with_creds(of->file->f_cred)
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
if (ret)
goto out_finish;
@@ -5892,7 +5887,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
- seqcount_init(&cgrp->freezer.freeze_seq);
+ seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2c178a..4aaad07b0bd1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1391,7 +1391,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
return isolcpus_updated;
}
-static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+static void update_isolation_cpumasks(bool isolcpus_updated)
{
int ret;
@@ -1402,6 +1402,9 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0);
+
+ ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
}
/**
@@ -1555,7 +1558,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
list_add(&cs->remote_sibling, &remote_children);
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(isolcpus_updated);
+ update_isolation_cpumasks(isolcpus_updated);
cpuset_force_rebuild();
cs->prs_err = 0;
@@ -1596,7 +1599,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(isolcpus_updated);
+ update_isolation_cpumasks(isolcpus_updated);
cpuset_force_rebuild();
/*
@@ -1665,7 +1668,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(isolcpus_updated);
+ update_isolation_cpumasks(isolcpus_updated);
if (adding || deleting)
cpuset_force_rebuild();
@@ -2023,7 +2026,7 @@ write_error:
WARN_ON_ONCE(parent->nr_subparts < 0);
}
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(isolcpus_updated);
+ update_isolation_cpumasks(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
@@ -3043,7 +3046,7 @@ out:
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(isolcpus_updated);
+ update_isolation_cpumasks(isolcpus_updated);
/* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -4180,7 +4183,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index dd9417425d92..915b02f65980 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer)
return css_freezer(freezer->css.parent);
}
-bool cgroup_freezing(struct task_struct *task)
+bool cgroup1_freezing(struct task_struct *task)
{
bool ret;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index fdbe57578e68..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
ret = ns_common_init(new_ns);
if (ret)
return ERR_PTR(ret);
- ns_tree_add(new_ns);
return no_free_ptr(new_ns);
}
@@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
+ ns_tree_add(new_ns);
return new_ns;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index db9f6c539b28..b674fdf96208 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __ro_after_init
= {CPU_BITS_ALL};
+unsigned int __num_possible_cpus __ro_after_init = NR_CPUS;
#else
struct cpumask __cpu_possible_mask __ro_after_init;
+unsigned int __num_possible_cpus __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
@@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src)
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(&__cpu_possible_mask, src);
+ __num_possible_cpus = cpumask_weight(&__cpu_possible_mask);
}
void set_cpu_online(unsigned int cpu, bool online)
@@ -3140,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online)
}
/*
+ * This should be marked __init, but there is a boatload of call sites
+ * which need to be fixed up to do so. Sigh...
+ */
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+ if (possible) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus++;
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus--;
+ }
+}
+
+/*
* Activate the first processor.
*/
void __init boot_cpu_init(void)
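The cpu.c change above caches the weight of the possible mask in __num_possible_cpus and keeps it in sync from init_cpu_possible() and set_cpu_possible(), so querying the count no longer re-counts the bitmap. A userspace sketch of the cached-popcount idea (plain bitmask, not the kernel cpumask API):

#include <stdbool.h>
#include <stdio.h>

static unsigned long long possible_mask;
static unsigned int num_possible;       /* cached popcount of the mask */

static void set_possible(unsigned int cpu, bool possible)
{
        unsigned long long bit = 1ULL << cpu;

        if (possible) {
                if (!(possible_mask & bit)) {   /* test-and-set semantics */
                        possible_mask |= bit;
                        num_possible++;
                }
        } else {
                if (possible_mask & bit) {
                        possible_mask &= ~bit;
                        num_possible--;
                }
        }
}

int main(void)
{
        set_possible(0, true);
        set_possible(3, true);
        set_possible(3, true);                  /* no double count */
        printf("%u possible CPUs\n", num_possible);     /* prints 2 */
        return 0;
}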
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 3b1c43382eec..99dac1aa972a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res,
old_res->start = 0;
old_res->end = 0;
} else {
- crashk_res.end = ram_res->start - 1;
+ old_res->end = ram_res->start - 1;
}
crash_free_reserved_phys_range(ram_res->start, ram_res->end);
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..a6f686b30da1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,33 +35,6 @@ do { \
static struct kmem_cache *cred_jar;
-/* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
-
-/*
- * The initial credentials for the initial task
- */
-struct cred init_cred = {
- .usage = ATOMIC_INIT(4),
- .uid = GLOBAL_ROOT_UID,
- .gid = GLOBAL_ROOT_GID,
- .suid = GLOBAL_ROOT_UID,
- .sgid = GLOBAL_ROOT_GID,
- .euid = GLOBAL_ROOT_UID,
- .egid = GLOBAL_ROOT_GID,
- .fsuid = GLOBAL_ROOT_UID,
- .fsgid = GLOBAL_ROOT_GID,
- .securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_EMPTY_SET,
- .cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_FULL_SET,
- .cap_bset = CAP_FULL_SET,
- .user = INIT_USER,
- .user_ns = &init_user_ns,
- .group_info = &init_groups,
- .ucounts = &init_ucounts,
-};
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
kdebug("share_creds(%p{%ld})",
p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
return 0;
}
@@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
+
return 0;
error_put:
@@ -435,10 +411,13 @@ int commit_creds(struct cred *new)
*/
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+ if (new->user_ns != old->user_ns)
+ switch_cred_namespaces(old, new);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 1e5c64cb6a42..138ede653de4 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -23,6 +23,7 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/swiotlb.h>
#include <asm/sections.h>
#include "debug.h"
@@ -594,7 +595,9 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
if (rc == -ENOMEM) {
pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
+ is_swiotlb_active(entry->dev))) {
err_printk(entry->dev, entry,
"cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..f973e7e73c90 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
case PCI_P2PDMA_MAP_BUS_ADDR:
sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
sg_phys(sg));
+ sg_dma_len(sg) = sg->length;
sg_dma_mark_bus_address(sg);
continue;
default:
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f62e1d1b2063..5c792b30c58a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,19 +11,20 @@
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- * @regs: Pointer to pt_regs on entry stack
- * @ti_work: TIF work flags as read by the caller
- */
-__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
+
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
- while (ti_work & EXIT_TO_USER_MODE_WORK) {
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
@@ -62,17 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
- enter_from_user_mode(regs);
-}
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
-noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- exit_to_user_mode_prepare(regs);
- instrumentation_end();
- exit_to_user_mode();
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
+ return ti_work;
+ ti_work = read_thread_flags();
+ }
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
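The entry/common.c rework above wraps the work loop in an outer retry loop: after the TIF work has been drained, a final rseq check may demand one more pass with freshly read flags. A toy model of that drain / re-check / repeat shape (userspace, with a simple counter standing in for the rseq restart condition):

#include <stdio.h>

static unsigned long pending = 0x5;     /* pretend TIF work bits */
static int restarts = 1;                /* rseq-style restart, at most once here */

static unsigned long drain_work(unsigned long work)
{
        while (work)
                work &= work - 1;       /* "handle" one bit at a time */
        return 0;
}

static unsigned long exit_loop(unsigned long work)
{
        for (;;) {
                work = drain_work(work);
                if (!restarts--)        /* no fixup needed: leave */
                        return work;
                pending = 0x2;          /* the fixup raised new work */
                work = pending;         /* re-read and go around again */
        }
}

int main(void)
{
        printf("left with work=%#lx\n", exit_loop(pending));
        return 0;
}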
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 66e6ba7fa80c..940a597ded40 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret ? : syscall;
}
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
regs = task_pt_regs(current);
}
+ if (defer_cookie) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one, and add
+ * the cookie after it (it will be cut off when the
+ * user stack is copied to the callchain).
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ perf_callchain_store_context(&ctx, defer_cookie);
+ goto exit_put;
+ }
+
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
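The callchain change above emits only a PERF_CONTEXT_USER_DEFERRED marker plus a cookie when the user stack will be unwound later; the full user callchain arrives afterwards in a PERF_RECORD_CALLCHAIN_DEFERRED record carrying the same cookie, and the consumer stitches the two together. A generic illustration of that cookie-stitching idea (plain C structs, not the perf ring-buffer format):

#include <stdint.h>
#include <stdio.h>

struct partial_record {                 /* sample with a deferred user stack */
        uint64_t cookie;
        int kernel_depth;
};

struct deferred_record {                /* the user stack, delivered later */
        uint64_t cookie;
        int user_depth;
};

static void stitch(const struct partial_record *p,
                   const struct deferred_record *d)
{
        if (p->cookie == d->cookie)     /* join the two halves by cookie */
                printf("sample %llu: kernel=%d user=%d frames\n",
                       (unsigned long long)p->cookie,
                       p->kernel_depth, d->user_depth);
}

int main(void)
{
        struct partial_record p = { .cookie = 42, .kernel_depth = 5 };
        struct deferred_record d = { .cookie = 42, .user_depth = 17 };

        stitch(&p, &d);
        return 0;
}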
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7541f6f85fcb..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
#include "internal.h"
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+static struct unwind_work perf_unwind_work;
+
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ u64 defer_cookie;
if (!current->mm)
user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, kernel, user,
- max_stack, crosstask, true);
+ if (!(user && defer_user && !crosstask &&
+ unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+ defer_cookie = 0;
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_cookie);
+
return callchain ?: &__empty_callchain;
}
@@ -9403,7 +9414,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_HUGETLB;
if (file) {
- struct inode *inode;
+ const struct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -9416,12 +9427,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = file_path(file, buf, PATH_MAX - sizeof(u64));
+ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
}
- inode = file_inode(vma->vm_file);
+ inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
@@ -9492,7 +9503,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
if (!filter->path.dentry)
return false;
- if (d_inode(filter->path.dentry) != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_user_inode(file))
return false;
if (filter->offset > offset + size)
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_callchain_deferred_event {
+ struct unwind_stacktrace *trace;
+ struct {
+ struct perf_event_header header;
+ u64 cookie;
+ u64 nr;
+ u64 ips[];
+ } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+ struct perf_callchain_deferred_event *deferred_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret, size = deferred_event->event.header.size;
+
+ if (!event->attr.defer_output)
+ return;
+
+ /* XXX do we really need sample_id_all for this ??? */
+ perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ deferred_event->event.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, deferred_event->event);
+ for (int i = 0; i < deferred_event->trace->nr; i++) {
+ u64 entry = deferred_event->trace->entries[i];
+ perf_output_put(&handle, entry);
+ }
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+ struct unwind_stacktrace *trace, u64 cookie)
+{
+ struct perf_callchain_deferred_event deferred_event = {
+ .trace = trace,
+ .event = {
+ .header = {
+ .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+ .misc = PERF_RECORD_MISC_USER,
+ .size = sizeof(deferred_event.event) +
+ (trace->nr * sizeof(u64)),
+ },
+ .cookie = cookie,
+ .nr = trace->nr,
+ },
+ };
+
+ perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
struct perf_text_poke_event {
const void *old_bytes;
const void *new_bytes;
@@ -11773,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+ if (event->state != PERF_EVENT_STATE_ACTIVE ||
+ event->hw.state & PERF_HES_STOPPED)
return HRTIMER_NORESTART;
event->pmu->read(event);
@@ -11819,15 +11891,20 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
/*
- * The throttle can be triggered in the hrtimer handler.
- * The HRTIMER_NORESTART should be used to stop the timer,
- * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+ * Careful: this function can be triggered in the hrtimer handler,
+ * for cpu-clock events, so hrtimer_cancel() would cause a
+ * deadlock.
+ *
+ * So use hrtimer_try_to_cancel() to try to stop the hrtimer,
+ * and the cpu-clock handler also sets the PERF_HES_STOPPED flag,
+ * which guarantees that perf_swevent_hrtimer() will stop the
+ * hrtimer once it sees the PERF_HES_STOPPED flag.
*/
if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
- hrtimer_cancel(&hwc->hrtimer);
+ hrtimer_try_to_cancel(&hwc->hrtimer);
}
}
@@ -11871,12 +11948,14 @@ static void cpu_clock_event_update(struct perf_event *event)
static void cpu_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, local_clock());
perf_swevent_start_hrtimer(event);
}
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
if (flags & PERF_EF_UPDATE)
cpu_clock_event_update(event);
@@ -11893,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
- cpu_clock_event_stop(event, flags);
+ cpu_clock_event_stop(event, PERF_EF_UPDATE);
}
static void cpu_clock_event_read(struct perf_event *event)
@@ -11950,12 +12029,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now)
static void task_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, event->ctx->time);
perf_swevent_start_hrtimer(event);
}
static void task_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
if (flags & PERF_EF_UPDATE)
task_clock_event_update(event, event->ctx->time);
@@ -14799,6 +14880,9 @@ void __init perf_event_init(void)
idr_init(&pmu_idr);
+ unwind_deferred_init(&perf_unwind_work,
+ perf_unwind_deferred_callback);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8709c69118b5..f11ceb8be8c4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2765,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
/*
* If user decided to take execution elsewhere, it makes little sense
* to execute the original instruction, so let's skip it.
@@ -2772,9 +2775,6 @@ static void handle_swbp(struct pt_regs *regs)
if (instruction_pointer(regs) != bp_vaddr)
goto out;
- /* Try to optimize after first hit. */
- arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
-
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..b9667ffcf7b3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -291,6 +291,7 @@ repeat:
write_unlock_irq(&tasklist_lock);
/* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
+ exit_cred_namespaces(p);
add_device_randomness(&p->se.sum_exec_runtime,
sizeof(p->se.sum_exec_runtime));
free_pids(post.pids);
@@ -910,6 +911,7 @@ void __noreturn do_exit(long code)
user_events_exit(tsk);
io_uring_files_cancel();
+ sched_mm_cid_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */
seccomp_filter_release(tsk);
@@ -939,7 +941,6 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
- unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
@@ -950,6 +951,12 @@ void __noreturn do_exit(long code)
* gets woken up by child-exit notifications.
*/
perf_event_exit_task(tsk);
+ /*
+ * PF_EXITING (above) ensures unwind_deferred_request() will no
+ * longer add new unwinds, while exit_mm() (below) will destroy the
+ * ability to do unwinds. So flush any pending unwinds here.
+ */
+ unwind_deferred_task_exit(tsk);
exit_mm();
@@ -962,7 +969,7 @@ void __noreturn do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
- exit_task_namespaces(tsk);
+ exit_nsproxy_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..83e05d6f2307 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,10 +955,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_SCHED_MM_CID
- tsk->mm_cid = -1;
- tsk->last_mm_cid = -1;
- tsk->mm_cid_active = 0;
- tsk->migrate_from_cpu = -1;
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
#endif
return tsk;
@@ -2453,9 +2451,10 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
+ sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
@@ -2487,6 +2486,7 @@ bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index ddc11a8bd2ea..a76bf957fb32 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p)
if (tsk_is_oom_victim(p))
return false;
- if (pm_nosig_freezing || cgroup_freezing(p))
+ if (pm_nosig_freezing || cgroup1_freezing(p))
return true;
if (pm_freezing && !(p->flags & PF_KTHREAD))
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 125804fbb5cb..cf7e610eac42 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -581,7 +581,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
if (flags & FLAGS_NUMA) {
u32 __user *naddr = (void *)uaddr + size / 2;
- if (futex_get_value(&node, naddr))
+ if (get_user_inline(node, naddr))
return -EFAULT;
if ((node != FUTEX_NO_NODE) &&
@@ -601,7 +601,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
node = numa_node_id();
node_updated = true;
}
- if (node_updated && futex_put_value(node, naddr))
+ if (node_updated && put_user_inline(node, naddr))
return -EFAULT;
}
@@ -1680,10 +1680,10 @@ static bool futex_ref_get(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
- guard(rcu)();
+ guard(preempt)();
- if (smp_load_acquire(&fph->state) == FR_PERCPU) {
- this_cpu_inc(*mm->futex_ref);
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_inc(*mm->futex_ref);
return true;
}
@@ -1694,10 +1694,10 @@ static bool futex_ref_put(struct futex_private_hash *fph)
{
struct mm_struct *mm = fph->mm;
- guard(rcu)();
+ guard(preempt)();
- if (smp_load_acquire(&fph->state) == FR_PERCPU) {
- this_cpu_dec(*mm->futex_ref);
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_dec(*mm->futex_ref);
return false;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 2cd57096c38e..30c2afa03889 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -281,63 +281,11 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
return ret;
}
-/*
- * This does a plain atomic user space read, and the user pointer has
- * already been verified earlier by get_futex_key() to be both aligned
- * and actually in user space, just like futex_atomic_cmpxchg_inatomic().
- *
- * We still want to avoid any speculation, and while __get_user() is
- * the traditional model for this, it's actually slower than doing
- * this manually these days.
- *
- * We could just have a per-architecture special function for it,
- * the same way we do futex_atomic_cmpxchg_inatomic(), but rather
- * than force everybody to do that, write it out long-hand using
- * the low-level user-access infrastructure.
- *
- * This looks a bit overkill, but generally just results in a couple
- * of instructions.
- */
-static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
-{
- u32 val;
-
- if (can_do_masked_user_access())
- from = masked_user_access_begin(from);
- else if (!user_read_access_begin(from, sizeof(*from)))
- return -EFAULT;
- unsafe_get_user(val, from, Efault);
- user_read_access_end();
- *dest = val;
- return 0;
-Efault:
- user_read_access_end();
- return -EFAULT;
-}
-
-static __always_inline int futex_put_value(u32 val, u32 __user *to)
-{
- if (can_do_masked_user_access())
- to = masked_user_access_begin(to);
- else if (!user_write_access_begin(to, sizeof(*to)))
- return -EFAULT;
- unsafe_put_user(val, to, Efault);
- user_write_access_end();
- return 0;
-Efault:
- user_write_access_end();
- return -EFAULT;
-}
-
+/* Read from user memory with pagefaults disabled */
static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
{
- int ret;
-
- pagefault_disable();
- ret = futex_get_value(dest, from);
- pagefault_enable();
-
- return ret;
+ guard(pagefault)();
+ return get_user_inline(*dest, from);
}
extern void __futex_unqueue(struct futex_q *q);
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index a08cc076f332..ffde93d051a4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/mm.h>
#include "gcov.h"
-#if (__GNUC__ >= 14)
+#if (__GNUC__ >= 15)
+#define GCOV_COUNTERS 10
+#elif (__GNUC__ >= 14)
#define GCOV_COUNTERS 9
#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3ffa0d80ddd1..678f094d261a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc)
void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- struct irqaction *action = desc->action;
unsigned int irq = irq_desc_get_irq(desc);
+ unsigned int cpu = smp_processor_id();
+ struct irqaction *action;
irqreturn_t res;
/*
@@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
+ for (action = desc->action; action; action = action->next)
+ if (cpumask_test_cpu(cpu, action->affinity))
+ break;
+
if (likely(action)) {
trace_irq_handler_entry(irq, action);
res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
trace_irq_handler_exit(irq, action, res);
} else {
- unsigned int cpu = smp_processor_id();
bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
if (enabled)
@@ -929,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
-/**
- * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
- * dev ids
- * @desc: the interrupt description structure for this irq
- *
- * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
- * as a percpu pointer.
- */
-void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
-{
- struct irq_chip *chip = irq_desc_get_chip(desc);
- struct irqaction *action = desc->action;
- unsigned int irq = irq_desc_get_irq(desc);
- irqreturn_t res;
-
- __kstat_incr_irqs_this_cpu(desc);
-
- trace_irq_handler_entry(irq, action);
- res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
- trace_irq_handler_exit(irq, action, res);
-
- if (chip->irq_eoi)
- chip->irq_eoi(&desc->irq_data);
-}
-
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
@@ -1030,7 +1009,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
- scoped_irqdesc_get_and_lock(irq, 0)
+ scoped_irqdesc_get_and_buslock(irq, 0)
__irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
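
With per-CPU-devid interrupts now allowed to carry several non-overlapping irqactions, handle_percpu_devid_irq() has to pick the action whose affinity covers the CPU the interrupt fired on. A minimal sketch of that selection, using only the list and affinity fields visible in this series (the helper name is illustrative):

/* Illustrative helper: at most one action can match on non-overlapping affinities. */
static struct irqaction *percpu_action_for_this_cpu(struct irq_desc *desc)
{
	unsigned int cpu = smp_processor_id();
	struct irqaction *action;

	for (action = desc->action; action; action = action->next) {
		if (cpumask_test_cpu(cpu, action->affinity))
			return action;
	}
	/* No match: the interrupt was not requested for this CPU. */
	return NULL;
}
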
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e103451243a0..786f5570a640 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,15 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
*/
atomic_inc(&desc->threads_active);
- wake_up_process(action->thread);
+ /*
+ * This might be a premature wakeup before the thread reached the
+ * thread function and set the IRQTF_READY bit. It's waiting in
+ * kthread code with state UNINTERRUPTIBLE. Once it reaches the
+ * thread function it waits with INTERRUPTIBLE. The wakeup is not
+ * lost in that case because the thread is guaranteed to observe
+ * the RUN flag before it goes to sleep in wait_for_interrupt().
+ */
+ wake_up_state(action->thread, TASK_INTERRUPTIBLE);
}
static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
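
The switch from wake_up_process() to wake_up_state(..., TASK_INTERRUPTIBLE) narrows the wakeup so it cannot disturb the not-yet-ready thread sleeping uninterruptibly in kthread code. A rough sketch of the consumer side this relies on is below; it is modelled on the comment above, not on the actual wait_for_interrupt() implementation.

/*
 * Illustrative wait loop: the thread only sleeps interruptibly once it can
 * observe the RUN flag, so a TASK_INTERRUPTIBLE-only wakeup is never lost.
 */
static void wait_for_interrupt_sketch(struct irqaction *action)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) {
			__set_current_state(TASK_RUNNING);
			return;
		}
		schedule();
	}
}
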
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index db714d3014b5..6acf268f005b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
chip_bus_sync_unlock(desc);
}
-int irq_set_percpu_devid_partition(unsigned int irq,
- const struct cpumask *affinity)
+int irq_set_percpu_devid(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq,
if (!desc->percpu_enabled)
return -ENOMEM;
- desc->percpu_affinity = affinity ? : cpu_possible_mask;
-
irq_set_percpu_devid_flags(irq);
return 0;
}
-int irq_set_percpu_devid(unsigned int irq)
-{
- return irq_set_percpu_devid_partition(irq, NULL);
-}
-
-int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc || !desc->percpu_enabled)
- return -EINVAL;
-
- if (affinity)
- cpumask_copy(affinity, desc->percpu_affinity);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
-
void kstat_incr_irq_this_cpu(unsigned int irq)
{
kstat_incr_irqs_this_cpu(irq_to_desc(irq));
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dc473faadcc8..2652c4cfd877 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
}
EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
-unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
- struct irq_data *irq_data;
- irq_hw_number_t hwirq;
- unsigned int type = IRQ_TYPE_NONE;
- int virq;
if (fwspec->fwnode) {
domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
@@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
domain = irq_default_domain;
}
+ return domain;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+ struct irq_domain *domain = fwspec_to_domain(fwspec);
+
+ memset(info, 0, sizeof(*info));
+
+ if (!domain || !domain->ops->get_fwspec_info)
+ return 0;
+
+ return domain->ops->get_fwspec_info(fwspec, info);
+}
+#endif
+
+unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+{
+ unsigned int type = IRQ_TYPE_NONE;
+ struct irq_domain *domain;
+ struct irq_data *irq_data;
+ irq_hw_number_t hwirq;
+ int virq;
+
+ domain = fwspec_to_domain(fwspec);
if (!domain) {
pr_warn("no irq domain found for %s !\n",
of_node_full_name(to_of_node(fwspec->fwnode)));
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c94837382037..0bb29316b436 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -547,7 +547,7 @@ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *noti
INIT_WORK(&notify->work, irq_affinity_notify);
}
- scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
old_notify = desc->affinity_notify;
desc->affinity_notify = notify;
}
@@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc)
static int __disable_irq_nosync(unsigned int irq)
{
- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
__disable_irq(scoped_irqdesc);
return 0;
}
@@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc)
*/
void enable_irq(unsigned int irq)
{
- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
struct irq_desc *desc = scoped_irqdesc;
if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
@@ -1001,7 +1001,6 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
{
cpumask_var_t mask;
- bool valid = false;
if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
return;
@@ -1018,21 +1017,13 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a
}
scoped_guard(raw_spinlock_irq, &desc->lock) {
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (cpumask_available(desc->irq_common_data.affinity)) {
- const struct cpumask *m;
+ const struct cpumask *m;
- m = irq_data_get_effective_affinity_mask(&desc->irq_data);
- cpumask_copy(mask, m);
- valid = true;
- }
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
}
- if (valid)
- set_cpus_allowed_ptr(current, mask);
+ set_cpus_allowed_ptr(current, mask);
free_cpumask_var(mask);
}
#else
@@ -1239,7 +1230,10 @@ static int irq_thread(void *data)
irq_thread_set_ready(desc, action);
- sched_set_fifo(current);
+ if (action->handler == irq_forced_secondary_handler)
+ sched_set_fifo_secondary(current);
+ else
+ sched_set_fifo(current);
if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
@@ -1405,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* references an already freed task_struct.
*/
new->thread = get_task_struct(t);
+
+ /*
+ * The affinity can not be established yet, but it will be once the
+ * interrupt is enabled. Delay and defer the actual setting to the
+ * thread itself once it is ready to run. In the meantime, prevent
+ * it from ever being re-affined directly by cpuset or
+ * housekeeping. The proper way to do it is to re-affine the whole
+ * vector.
+ */
+ kthread_bind_mask(t, cpu_possible_mask);
+
/*
- * Tell the thread to set its affinity. This is
- * important for shared interrupt handlers as we do
- * not invoke setup_affinity() for the secondary
- * handlers as everything is already set up. Even for
- * interrupts marked with IRQF_NO_BALANCE this is
- * correct as we want the thread to move to the cpu(s)
- * on which the requesting code placed the interrupt.
+ * Ensure the thread adjusts the affinity once it reaches the
+ * thread function.
*/
- set_bit(IRQTF_AFFINITY, &new->thread_flags);
+ new->thread_flags = BIT(IRQTF_AFFINITY);
+
return 0;
}
+static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new)
+{
+ do {
+ if (cpumask_intersects(old->affinity, new->affinity) ||
+ old->percpu_dev_id == new->percpu_dev_id)
+ return false;
+
+ old = old->next;
+ } while (old);
+
+ return true;
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -1438,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
struct irqaction *old, **old_ptr;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
+ bool per_cpu_devid;
if (!desc)
return -EINVAL;
@@ -1447,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!try_module_get(desc->owner))
return -ENODEV;
+ per_cpu_devid = irq_settings_is_per_cpu_devid(desc);
+
new->irq = irq;
/*
@@ -1554,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
unsigned int oldtype;
- if (irq_is_nmi(desc)) {
+ if (irq_is_nmi(desc) && !per_cpu_devid) {
pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
new->name, irq, desc->irq_data.chip->name);
ret = -EINVAL;
goto out_unlock;
}
+ if (per_cpu_devid && !valid_percpu_irqaction(old, new)) {
+ pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1711,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!(new->flags & IRQF_NO_AUTOEN) &&
irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
- } else {
+ } else if (!per_cpu_devid) {
/*
* Shared interrupts do not go well with disabling
* auto enable. The sharing interrupt might request
@@ -2346,7 +2370,7 @@ void disable_percpu_nmi(unsigned int irq)
static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
+ struct irqaction *action, **action_ptr;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -2354,21 +2378,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
return NULL;
scoped_guard(raw_spinlock_irqsave, &desc->lock) {
- action = desc->action;
- if (!action || action->percpu_dev_id != dev_id) {
- WARN(1, "Trying to free already-free IRQ %d\n", irq);
- return NULL;
+ action_ptr = &desc->action;
+ for (;;) {
+ action = *action_ptr;
+
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ return NULL;
+ }
+
+ if (action->percpu_dev_id == dev_id)
+ break;
+
+ action_ptr = &action->next;
}
- if (!cpumask_empty(desc->percpu_enabled)) {
- WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
- irq, cpumask_first(desc->percpu_enabled));
+ if (cpumask_intersects(desc->percpu_enabled, action->affinity)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq,
+ cpumask_first_and(desc->percpu_enabled, action->affinity));
return NULL;
}
/* Found it - now remove it from the list of entries: */
- desc->action = NULL;
- desc->istate &= ~IRQS_NMI;
+ *action_ptr = action->next;
+
+ /* Demote from NMI if we killed the last action */
+ if (!desc->action)
+ desc->istate &= ~IRQS_NMI;
}
unregister_handler_proc(irq, action);
@@ -2442,17 +2478,49 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
return retval;
}
+static
+struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags,
+ const char *devname, const cpumask_t *affinity,
+ void __percpu *dev_id)
+{
+ struct irqaction *action;
+
+ if (!affinity)
+ affinity = cpu_possible_mask;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return NULL;
+
+ action->handler = handler;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->name = devname;
+ action->percpu_dev_id = dev_id;
+ action->affinity = affinity;
+
+ /*
+ * We allow some form of sharing for non-overlapping affinity
+ * masks. Obviously, covering all CPUs prevents any sharing in
+ * the first place.
+ */
+ if (!cpumask_equal(affinity, cpu_possible_mask))
+ action->flags |= IRQF_SHARED;
+
+ return action;
+}
+
/**
* __request_percpu_irq - allocate a percpu interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
* @flags: Interrupt type flags (IRQF_TIMER only)
* @devname: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
* @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources and enables the interrupt on the
- * local CPU. If the interrupt is supposed to be enabled on other CPUs, it
- * has to be done on each CPU using enable_percpu_irq().
+ * This call allocates interrupt resources, but doesn't enable the interrupt
+ * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN.
+ * The interrupt has to be enabled on each CPU using enable_percpu_irq().
*
* @dev_id must be globally unique. It is a per-cpu variable, and
* the handler gets called with the interrupted CPU's instance of
@@ -2460,7 +2528,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
*/
int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
- void __percpu *dev_id)
+ const cpumask_t *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -2477,15 +2545,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
if (flags && flags != IRQF_TIMER)
return -EINVAL;
- action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id);
if (!action)
return -ENOMEM;
- action->handler = handler;
- action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
- action->name = devname;
- action->percpu_dev_id = dev_id;
-
retval = irq_chip_pm_get(&desc->irq_data);
if (retval < 0) {
kfree(action);
@@ -2508,6 +2571,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq);
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
* @name: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
* @dev_id: A percpu cookie passed back to the handler function
*
* This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
@@ -2524,8 +2588,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq);
* If the interrupt line cannot be used to deliver NMIs, function
* will fail returning a negative value.
*/
-int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
- const char *name, void __percpu *dev_id)
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+ const struct cpumask *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -2542,20 +2606,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
!irq_supports_nmi(desc))
return -EINVAL;
- /* The line cannot already be NMI */
- if (irq_is_nmi(desc))
+ /* The line cannot be NMI already if the new request covers all CPUs */
+ if (irq_is_nmi(desc) &&
+ (!affinity || cpumask_equal(affinity, cpu_possible_mask)))
return -EINVAL;
- action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING,
+ name, affinity, dev_id);
if (!action)
return -ENOMEM;
- action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
- | IRQF_NOBALANCING;
- action->name = name;
- action->percpu_dev_id = dev_id;
-
retval = irq_chip_pm_get(&desc->irq_data);
if (retval < 0)
goto err_out;
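
For callers, the visible change in this file is the extra @affinity argument to __request_percpu_irq() and request_percpu_nmi(). A hedged usage sketch follows; the driver name, percpu structure and cpumask are made up for illustration, and passing a NULL affinity keeps the historical cpu_possible_mask behaviour.

struct my_pmu_data { u64 counter; };			/* illustrative */
static DEFINE_PER_CPU(struct my_pmu_data, my_pmu_data);

static irqreturn_t my_pmu_handler(int irq, void *dev_id)
{
	struct my_pmu_data *data = dev_id;		/* this CPU's instance */

	data->counter++;
	return IRQ_HANDLED;
}

static int my_pmu_request(unsigned int irq, const struct cpumask *cluster_cpus)
{
	/*
	 * A mask narrower than cpu_possible_mask lets another request with a
	 * non-overlapping mask share the same per-CPU interrupt line.
	 */
	return __request_percpu_irq(irq, my_pmu_handler, 0, "my-pmu",
				    cluster_cpus, &my_pmu_data);
}
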
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index e7ad99254841..68886881fe10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -706,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
int i, ret;
- if (irq_find_mapping(domain, hwirq) > 0)
+ if (irq_resolve_mapping(domain, hwirq))
return -EEXIST;
if (domain->parent) {
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 29c2404e743b..77258eafbf63 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -48,6 +48,8 @@ static int show_irq_affinity(int type, struct seq_file *m)
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask;
+ guard(raw_spinlock_irq)(&desc->lock);
+
switch (type) {
case AFFINITY:
case AFFINITY_LIST:
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 76f0940fb485..03d12e27189f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "KHO: " fmt
+#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
@@ -22,6 +23,7 @@
#include <asm/early_ioremap.h>
+#include "kexec_handover_internal.h"
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
@@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable);
* Keep track of memory that is to be preserved across KHO.
*
* The serializing side uses two levels of xarrays to manage chunks of per-order
- * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
- * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
- * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
- * 512K of bitmap memory will be needed for order 0.
+ * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
+ * of an 8TB system would fit inside a single 4096 byte bitmap. For order 0
+ * allocations each bitmap will cover 128M of address space. Thus, for 16G of
+ * memory at most 512K of bitmap memory will be needed for order 0.
*
* This approach is fully incremental, as the serialization progresses folios
 * can continue to be aggregated to the tracker. The final step, immediately prior
@@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable);
* successor kernel to parse.
*/
-#define PRESERVE_BITS (512 * 8)
+#define PRESERVE_BITS (PAGE_SIZE * 8)
struct kho_mem_phys_bits {
DECLARE_BITMAP(preserve, PRESERVE_BITS);
};
+static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
+
struct kho_mem_phys {
/*
* Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
@@ -131,28 +135,28 @@ static struct kho_out kho_out = {
.finalized = false,
};
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
- void *elm, *res;
+ void *res = xa_load(xa, index);
+
+ if (res)
+ return res;
- elm = xa_load(xa, index);
- if (elm)
- return elm;
+ void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
- elm = kzalloc(sz, GFP_KERNEL);
if (!elm)
return ERR_PTR(-ENOMEM);
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
if (xa_is_err(res))
- res = ERR_PTR(xa_err(res));
-
- if (res) {
- kfree(elm);
+ return ERR_PTR(xa_err(res));
+ else if (res)
return res;
- }
- return elm;
+ return no_free_ptr(elm);
}
static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
@@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
const unsigned long pfn_high = pfn >> order;
physxa = xa_load(&track->orders, order);
- if (!physxa)
- continue;
+ if (WARN_ON_ONCE(!physxa))
+ return;
bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (!bits)
- continue;
+ if (WARN_ON_ONCE(!bits))
+ return;
clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
@@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
}
}
- bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
- sizeof(*bits));
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
if (IS_ERR(bits))
return PTR_ERR(bits);
@@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
unsigned long order)
{
- struct khoser_mem_chunk *chunk;
+ struct khoser_mem_chunk *chunk __free(free_page) = NULL;
- chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ chunk = (void *)get_zeroed_page(GFP_KERNEL);
if (!chunk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
chunk->hdr.order = order;
if (cur_chunk)
KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
- return chunk;
+ return no_free_ptr(chunk);
}
static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
@@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser)
struct khoser_mem_chunk *chunk = NULL;
struct kho_mem_phys *physxa;
unsigned long order;
+ int err = -ENOMEM;
xa_for_each(&ser->track.orders, order, physxa) {
struct kho_mem_phys_bits *bits;
unsigned long phys;
chunk = new_chunk(chunk, order);
- if (!chunk)
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
goto err_free;
+ }
if (!first_chunk)
first_chunk = chunk;
@@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser)
if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
chunk = new_chunk(chunk, order);
- if (!chunk)
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
goto err_free;
+ }
}
elm = &chunk->bitmaps[chunk->hdr.num_elms];
@@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
err_free:
kho_mem_ser_free(first_chunk);
- return -ENOMEM;
+ return err;
}
static void __init deserialize_bitmap(unsigned int order,
@@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt)
* area for early allocations that happen before page allocator is
* initialized.
*/
-static struct kho_scratch *kho_scratch;
-static unsigned int kho_scratch_cnt;
+struct kho_scratch *kho_scratch;
+unsigned int kho_scratch_cnt;
/*
* The scratch areas are scaled by default as percent of memory allocated from
@@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio)
const unsigned int order = folio_order(folio);
struct kho_mem_track *track = &kho_out.ser.track;
+ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
+ return -EINVAL;
+
return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
unsigned long failed_pfn = 0;
int err = 0;
+ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT))) {
+ return -EINVAL;
+ }
+
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
@@ -862,16 +882,17 @@ err_free:
return NULL;
}
-static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
+ unsigned short order)
{
struct kho_mem_track *track = &kho_out.ser.track;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1);
- for (int i = 0; chunk->phys[i]; i++) {
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
pfn = PHYS_PFN(chunk->phys[i]);
- __kho_unpreserve(track, pfn, pfn + 1);
+ __kho_unpreserve(track, pfn, pfn + (1 << order));
}
}
@@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
while (chunk) {
struct kho_vmalloc_chunk *tmp = chunk;
- kho_vmalloc_unpreserve_chunk(chunk);
+ kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
free_page((unsigned long)tmp);
@@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
while (chunk) {
struct page *page;
- for (int i = 0; chunk->phys[i]; i++) {
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
phys_addr_t phys = chunk->phys[i];
if (idx + contig_pages > total_pages)
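
Both xa_load_or_alloc() and new_chunk() above move to the scope-based cleanup idiom from <linux/cleanup.h>: the freshly allocated page is released automatically on every early-return path and no_free_ptr() hands ownership out on success. A standalone sketch of the same pattern (the helper name and the reject() callback are illustrative):

#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/gfp.h>

/* Illustrative helper, not part of the patch. */
static void *alloc_checked_page(bool (*reject)(void *page))
{
	void *page __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);

	if (!page)
		return ERR_PTR(-ENOMEM);

	if (reject && reject(page))
		return ERR_PTR(-EINVAL);	/* page is freed automatically here */

	return no_free_ptr(page);		/* success: caller now owns the page */
}
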
diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c
new file mode 100644
index 000000000000..6efb696f5426
--- /dev/null
+++ b/kernel/kexec_handover_debug.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debug.c - kexec handover optional debug functionality
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include "kexec_handover_internal.h"
+
+bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ phys_addr_t scratch_start, scratch_end;
+ unsigned int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ scratch_start = kho_scratch[i].addr;
+ scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
+
+ if (phys < scratch_end && (phys + size) > scratch_start)
+ return true;
+ }
+
+ return false;
+}
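
kho_scratch_overlap() is the standard half-open interval test: ranges [a, a+alen) and [b, b+blen) overlap iff a < b+blen and a+alen > b. A generic form of the check with an arbitrary worked example:

/* Generic form of the check above (half-open intervals). */
static bool ranges_overlap(phys_addr_t a, size_t alen, phys_addr_t b, size_t blen)
{
	return a < b + blen && a + alen > b;
}

/*
 * Example (arbitrary values): scratch [0x1000, 0x3000), candidate [0x2800, 0x2a00)
 * -> 0x2800 < 0x3000 && 0x2a00 > 0x1000 -> overlap, the check returns true.
 * A candidate starting exactly at 0x3000 would not overlap.
 */
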
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h
new file mode 100644
index 000000000000..3c3c7148ceed
--- /dev/null
+++ b/kernel/kexec_handover_internal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/types.h>
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 31b072e8d427..99a3808d086f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 53d51ed619a3..4c0a9c18d0b2 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -18,3 +18,15 @@ config LIVEPATCH
module uses the interface provided by this option to register
a patch, causing calls to patched functions to be redirected
to new function code contained in the patch module.
+
+config HAVE_KLP_BUILD
+ bool
+ help
+ Arch supports klp-build
+
+config KLP_BUILD
+ def_bool y
+ depends on LIVEPATCH && HAVE_KLP_BUILD
+ select OBJTOOL
+ help
+ Enable klp-build support
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 0e73fac55f8e..0044a8125013 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -217,14 +217,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
- pr_err("symbol %s is not marked as a livepatch symbol\n",
- strtab + sym->st_name);
+ pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n",
+ strtab + sym->st_name, symndx, i);
return -EINVAL;
}
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%511[^,],%lu",
+ KLP_SYM_PREFIX "%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
@@ -303,7 +303,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* See comment in klp_resolve_symbols() for an explanation
* of the selected field width value.
*/
- cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]",
+ cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]",
sec_objname);
if (cnt != 1) {
pr_err("section %s has an incorrectly formatted name\n",
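
For reference, the sscanf() format above splits a livepatch symbol of the form .klp.sym.<objname>.<symname>,<sympos>. A small userspace-style sketch of the same parse, assuming KLP_SYM_PREFIX expands to ".klp.sym." as the literal it replaces suggests; the target function name is made up:

#include <stdio.h>

int main(void)
{
	char objname[56], name[512];
	unsigned long sympos;
	const char *sym = ".klp.sym.vmlinux.do_one_initcall,0";

	if (sscanf(sym, ".klp.sym.%55[^.].%511[^,],%lu", objname, name, &sympos) == 3)
		printf("obj=%s sym=%s pos=%lu\n", objname, name, sympos);
	/* prints: obj=vmlinux sym=do_one_initcall pos=0 */
	return 0;
}
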
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 949103fd8e9b..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock)
}
}
-void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
-#endif
lock->magic = lock;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index de7d6702cd96..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -43,8 +43,7 @@
# define MUTEX_WARN_ON(cond)
#endif
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
{
atomic_long_set(&lock->owner, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
-
- debug_mutex_init(lock, name, key);
+ debug_mutex_init(lock);
}
-EXPORT_SYMBOL(__mutex_init);
static inline struct task_struct *__owner_task(unsigned long owner)
{
@@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
* There is nothing that would stop spreading the lockdep annotations outwards
* except more code.
*/
+void mutex_init_generic(struct mutex *lock)
+{
+ __mutex_init_generic(lock);
+}
+EXPORT_SYMBOL(mutex_init_generic);
/*
* Optimistic trylock that only works in the uncontended case. Make sure to
@@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
-#endif
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ __mutex_init_generic(lock);
+
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_init_lockdep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
{
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 2e8080a9bee3..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock,
extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+extern void debug_mutex_init(struct mutex *lock);
#else /* CONFIG_DEBUG_MUTEXES */
# define debug_mutex_lock_common(lock, waiter) do { } while (0)
# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
@@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_unlock(lock) do { } while (0)
-# define debug_mutex_init(lock, name, key) do { } while (0)
+# define debug_mutex_init(lock) do { } while (0)
#endif /* !CONFIG_DEBUG_MUTEXES */
#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index bafd5af98eae..59dbd29cb219 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task)
#ifdef CONFIG_PREEMPT_RT
/* Mutexes */
-void __mutex_rt_init(struct mutex *mutex, const char *name,
- struct lock_class_key *key)
+static void __mutex_rt_init_generic(struct mutex *mutex)
{
+ rt_mutex_base_init(&mutex->rtmutex);
debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
- lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
}
-EXPORT_SYMBOL(__mutex_rt_init);
static __always_inline int __mutex_lock_common(struct mutex *lock,
unsigned int state,
@@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+ __mutex_rt_init_generic(mutex);
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
@@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock,
EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
#else /* CONFIG_DEBUG_LOCK_ALLOC */
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+ __mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
void __sched mutex_lock(struct mutex *lock)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 87b03d2e41db..2338b3adfb55 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock)
static inline void debug_write_lock_before(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c66b26184936..7b3ec2fa6e7c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3675,24 +3675,35 @@ static int idempotent_wait_for_completion(struct idempotent *u)
static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
+ bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE);
struct load_info info = { };
void *buf = NULL;
int len;
+ int err;
- len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
+ len = kernel_read_file(f, 0, &buf, INT_MAX, NULL,
+ compressed ? READING_MODULE_COMPRESSED :
+ READING_MODULE);
if (len < 0) {
mod_stat_inc(&failed_kreads);
return len;
}
- if (flags & MODULE_INIT_COMPRESSED_FILE) {
- int err = module_decompress(&info, buf, len);
+ if (compressed) {
+ err = module_decompress(&info, buf, len);
vfree(buf); /* compressed data is no longer needed */
if (err) {
mod_stat_inc(&failed_decompress);
mod_stat_add_long(len, &invalid_decompress_bytes);
return err;
}
+ err = security_kernel_post_read_file(f, (char *)info.hdr, info.len,
+ READING_MODULE);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ free_copy(&info, flags);
+ return err;
+ }
} else {
info.hdr = buf;
info.len = len;
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c1fb2bad6d72..bdc3c86231d3 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
+#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
@@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
+ int ret = 0;
+
refcount_set(&ns->__ns_ref, 1);
ns->stashed = NULL;
ns->ops = ops;
ns->ns_id = 0;
ns->ns_type = ns_type;
- RB_CLEAR_NODE(&ns->ns_tree_node);
- INIT_LIST_HEAD(&ns->ns_list_node);
+ ns_tree_node_init(&ns->ns_tree_node);
+ ns_tree_node_init(&ns->ns_unified_node);
+ ns_tree_node_init(&ns->ns_owner_node);
+ ns_tree_root_init(&ns->ns_owner_root);
#ifdef CONFIG_DEBUG_VFS
ns_debug(ns, ops);
#endif
- if (inum) {
+ if (inum)
ns->inum = inum;
- return 0;
- }
- return proc_alloc_inum(&ns->inum);
+ else
+ ret = proc_alloc_inum(&ns->inum);
+ if (ret)
+ return ret;
+ /*
+	 * The active ref starts at 0. It's incremented when the namespace enters
+ * active use (installed in nsproxy) and decremented when all
+ * active uses are gone. Initial namespaces are always active.
+ */
+ if (is_ns_init_inum(ns))
+ atomic_set(&ns->__ns_ref_active, 1);
+ else
+ atomic_set(&ns->__ns_ref_active, 0);
+ return 0;
}
void __ns_common_free(struct ns_common *ns)
{
proc_free_inum(ns->inum);
}
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+ struct user_namespace *owner;
+
+ if (unlikely(!ns->ops))
+ return NULL;
+ VFS_WARN_ON_ONCE(!ns->ops->owner);
+ owner = ns->ops->owner(ns);
+ VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+ if (!owner)
+ return NULL;
+ /* Skip init_user_ns as it's always active */
+ if (owner == &init_user_ns)
+ return NULL;
+ return to_ns_common(owner);
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put(struct ns_common *ns)
+{
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+ }
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Assume the whole tree has gone inactive but all namespaces are still alive:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - - -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
+ *
+ * net_ns pid_ns
+ * \ /
+ * + -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - + -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_get(struct ns_common *ns)
+{
+ int prev;
+
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ /* If we didn't resurrect the namespace we're done. */
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+
+ /*
+ * We did resurrect it. Walk the ownership hierarchy upwards
+	 * until we find an owning user namespace that is active.
+ */
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+ }
+}
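
The cascade implemented by __ns_ref_active_put()/__ns_ref_active_get() can be modelled with plain counters; the single-threaded sketch below mirrors the diagrams above, with the always-active init_user_ns represented as a NULL owner (names are illustrative):

struct model_ns {
	int active;			/* models __ns_ref_active */
	struct model_ns *owner;		/* NULL stands in for init_user_ns */
};

static void model_active_put(struct model_ns *ns)
{
	/* Dropping the last active ref drops one ref on the owner, and so on. */
	while (ns && --ns->active == 0)
		ns = ns->owner;
}

/*
 * With user_ns1.active == 2 as in the first diagram, calling
 * model_active_put() for net_ns and then for pid_ns brings user_ns1 to 0,
 * which in turn drops one active reference on user_ns2.
 */
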
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 19aa64ab08c8..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
+#include <linux/nstree.h>
static struct kmem_cache *nsproxy_cachep;
@@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void)
return nsproxy;
}
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+ nsproxy_ns_active_put(ns);
+ nsproxy_free(ns);
+}
+
/*
 * Create new nsproxy and all of its associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
@@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
if ((flags & CLONE_VM) == 0)
timens_on_fork(new_ns, tsk);
+ nsproxy_ns_active_get(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
-void free_nsproxy(struct nsproxy *ns)
-{
- put_mnt_ns(ns->mnt_ns);
- put_uts_ns(ns->uts_ns);
- put_ipc_ns(ns->ipc_ns);
- put_pid_ns(ns->pid_ns_for_children);
- put_time_ns(ns->time_ns);
- put_time_ns(ns->time_ns_for_children);
- put_cgroup_ns(ns->cgroup_ns);
- put_net(ns->net_ns);
- kmem_cache_free(nsproxy_cachep, ns);
-}
-
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
@@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ if (new)
+ nsproxy_ns_active_get(new);
+
task_lock(p);
ns = p->nsproxy;
p->nsproxy = new;
@@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
put_nsproxy(ns);
}
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+ ns_ref_active_get(new->user_ns);
+ ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
int exec_task_namespaces(void)
{
struct task_struct *tsk = current;
@@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset)
if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
free_fs_struct(nsset->fs);
if (nsset->nsproxy)
- free_nsproxy(nsset->nsproxy);
+ nsproxy_free(nsset->nsproxy);
}
static int prepare_nsset(unsigned flags, struct nsset *nsset)
diff --git a/kernel/nstree.c b/kernel/nstree.c
index b24a320a11a6..f36c59e6951d 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -1,140 +1,261 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/rculist.h>
#include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
-/**
- * struct ns_tree - Namespace tree
- * @ns_tree: Rbtree of namespaces of a particular type
- * @ns_list: Sequentially walkable list of all namespaces of this type
- * @ns_tree_lock: Seqlock to protect the tree and list
- * @type: type of namespaces in this tree
- */
-struct ns_tree {
- struct rb_root ns_tree;
- struct list_head ns_list;
- seqlock_t ns_tree_lock;
- int type;
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+
+DEFINE_LOCK_GUARD_0(ns_tree_writer,
+ write_seqlock(&ns_tree_lock),
+ write_sequnlock(&ns_tree_lock))
+
+DEFINE_LOCK_GUARD_0(ns_tree_locked_reader,
+ read_seqlock_excl(&ns_tree_lock),
+ read_sequnlock_excl(&ns_tree_lock))
+
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
};
-struct ns_tree mnt_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNS,
+struct ns_tree_root mnt_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
};
-struct ns_tree net_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNET,
+struct ns_tree_root net_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
};
EXPORT_SYMBOL_GPL(net_ns_tree);
-struct ns_tree uts_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUTS,
+struct ns_tree_root uts_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
};
-struct ns_tree user_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUSER,
+struct ns_tree_root user_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
};
-struct ns_tree ipc_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
- .type = CLONE_NEWIPC,
+struct ns_tree_root ipc_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
};
-struct ns_tree pid_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
- .type = CLONE_NEWPID,
+struct ns_tree_root pid_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
};
-struct ns_tree cgroup_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
- .type = CLONE_NEWCGROUP,
+struct ns_tree_root cgroup_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
};
-struct ns_tree time_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
- .type = CLONE_NEWTIME,
+struct ns_tree_root time_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
};
-DEFINE_COOKIE(namespace_cookie);
+/**
+ * ns_tree_node_init - Initialize a namespace tree node
+ * @node: The node to initialize
+ *
+ * Initializes both the rbtree node and list entry.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+ RB_CLEAR_NODE(&node->ns_node);
+ INIT_LIST_HEAD(&node->ns_list_entry);
+}
+
+/**
+ * ns_tree_root_init - Initialize a namespace tree root
+ * @root: The root to initialize
+ *
+ * Initializes both the rbtree root and list head.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+ root->ns_rb = RB_ROOT;
+ INIT_LIST_HEAD(&root->ns_list_head);
+}
+
+/**
+ * ns_tree_node_empty - Check if a namespace tree node is empty
+ * @node: The node to check
+ *
+ * Returns true if the node is not in any tree.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+ return RB_EMPTY_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node *ret, *prev;
+
+ /* Add to rbtree */
+ ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+
+ /* Add to list in sorted order */
+ prev = rb_prev(&node->ns_node);
+ if (!prev) {
+ /* No previous node, add at head */
+ list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+ } else {
+ /* Add after previous node */
+ struct ns_tree_node *prev_node;
+ prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+ list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+ }
+
+ return ret;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Removes the node from both the rbtree and the list atomically.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+ rb_erase(&node->ns_node, &root->ns_rb);
+ RB_CLEAR_NODE(&node->ns_node);
+ list_bidir_del_rcu(&node->ns_list_entry);
+}
static inline struct ns_common *node_to_ns(const struct rb_node *node)
{
if (!node)
return NULL;
- return rb_entry(node, struct ns_common, ns_tree_node);
+ return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
}
-static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
{
- struct ns_common *ns_a = node_to_ns(a);
- struct ns_common *ns_b = node_to_ns(b);
- u64 ns_id_a = ns_a->ns_id;
- u64 ns_id_b = ns_b->ns_id;
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
+}
- if (ns_id_a < ns_id_b)
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_owner_node.ns_node);
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+ if (id_a < id_b)
return -1;
- if (ns_id_a > ns_id_b)
+ if (id_a > id_b)
return 1;
return 0;
}
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id);
+}
+
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
{
- struct rb_node *node, *prev;
+ return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ struct rb_node *node;
+ const struct proc_ns_operations *ops = ns->ops;
VFS_WARN_ON_ONCE(!ns->ns_id);
- write_seqlock(&ns_tree->ns_tree_lock);
+ guard(ns_tree_writer)();
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ /* Add to per-type tree and list */
+ node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
- node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
- else
- list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+ /* Add to unified tree and list */
+ ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
+
+ /* Add to owner's tree if applicable */
+ if (ops) {
+ struct user_namespace *user_ns;
- write_sequnlock(&ns_tree->ns_tree_lock);
+ VFS_WARN_ON_ONCE(!ops->owner);
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ /* Insert into owner's tree and list */
+ ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
+ } else {
+ /* Only the initial user namespace doesn't have an owner. */
+ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+ }
+ }
VFS_WARN_ON_ONCE(node);
}
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
{
- VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
- VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ const struct proc_ns_operations *ops = ns->ops;
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
+
+ write_seqlock(&ns_tree_lock);
+
+ /* Remove from per-type tree and list */
+ ns_tree_node_del(&ns->ns_tree_node, ns_tree);
+
+ /* Remove from unified tree and list */
+ ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
- write_seqlock(&ns_tree->ns_tree_lock);
- rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
- list_bidir_del_rcu(&ns->ns_list_node);
- RB_CLEAR_NODE(&ns->ns_tree_node);
- write_sequnlock(&ns_tree->ns_tree_lock);
+ /* Remove from owner's tree if applicable */
+ if (ops) {
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
+ }
+ }
+
+ write_sequnlock(&ns_tree_lock);
}
EXPORT_SYMBOL_GPL(__ns_tree_remove);
@@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node)
return 0;
}
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns_unified(node);
-static struct ns_tree *ns_tree_from_type(int ns_type)
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
{
switch (ns_type) {
case CLONE_NEWCGROUP:
@@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type)
return NULL;
}
-struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
{
- struct ns_tree *ns_tree;
struct rb_node *node;
unsigned int seq;
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree_root *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
ns_tree = ns_tree_from_type(ns_type);
if (!ns_tree)
return NULL;
do {
- seq = read_seqbegin(&ns_tree->ns_tree_lock);
- node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
if (node)
break;
- } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+ } while (read_seqretry(&ns_tree_lock, seq));
- if (!node)
- return NULL;
+ return node_to_ns(node);
+}
- VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
- return node_to_ns(node);
+ if (ns_type)
+ return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+ return __ns_unified_tree_lookup_rcu(ns_id);
}
/**
- * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same
* tree
* @ns: namespace to start from
+ * @ns_tree: namespace tree to search in
* @previous: if true find the previous namespace, otherwise the next
*
* Find the next or previous namespace in the same tree as @ns. If
* there is no next/previous namespace, -ENOENT is returned.
*/
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree, bool previous)
+ struct ns_tree_root *ns_tree, bool previous)
{
struct list_head *list;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
else
- list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
- if (list_is_head(list, &ns_tree->ns_list))
+ list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+ if (list_is_head(list, &ns_tree->ns_list_head))
return ERR_PTR(-ENOENT);
- VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
-
- return list_entry_rcu(list, struct ns_common, ns_list_node);
+ return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
}
/**
- * ns_tree_gen_id - generate a new namespace id
+ * __ns_tree_gen_id - generate a new namespace id
* @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
*
* Generates a new namespace id and assigns it to the namespace. All
 * namespace types share the same id space and thus can be compared
 * directly. IOW, when the ids of two namespaces are equal, they are
* identical.
*/
-u64 ns_tree_gen_id(struct ns_common *ns)
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
{
- guard(preempt)();
- ns->ns_id = gen_cookie_next(&namespace_cookie);
+ static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+ if (id)
+ ns->ns_id = id;
+ else
+ ns->ns_id = atomic64_inc_return(&namespace_cookie);
return ns->ns_id;
}
+
+struct klistns {
+ u64 __user *uns_ids;
+ u32 nr_ns_ids;
+ u64 last_ns_id;
+ u64 user_ns_id;
+ u32 ns_type;
+ struct user_namespace *user_ns;
+ bool userns_capable;
+ struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+ if (kls->user_ns_id != LISTNS_CURRENT_USER)
+ put_user_ns(kls->user_ns);
+ if (kls->first_ns && kls->first_ns->ops)
+ kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+ struct ns_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ if (kreq->ns_type & ~NS_ALL)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+ u64 __user *ns_ids, size_t nr_ns_ids)
+{
+ kls->last_ns_id = kreq->ns_id;
+ kls->user_ns_id = kreq->user_ns_id;
+ kls->nr_ns_ids = nr_ns_ids;
+ kls->ns_type = kreq->ns_type;
+ kls->uns_ids = ns_ids;
+ return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+ struct ns_common *ret = NULL;
+ struct rb_node *node;
+
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ guard(ns_tree_locked_reader)();
+
+ node = owner->ns_owner_root.ns_rb.rb_node;
+ while (node) {
+ struct ns_common *ns;
+
+ ns = node_to_ns_owner(node);
+ if (ns_id <= ns->ns_id) {
+ ret = ns;
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type)
+{
+ struct ns_common *ns;
+
+ guard(rcu)();
+ ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type);
+ if (!ns)
+ return NULL;
+
+ if (!ns_get_unless_inactive(ns))
+ return NULL;
+
+ return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+ const struct ns_common *ns)
+{
+ return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+ struct ns_common *ns)
+{
+ if (kls->user_ns) {
+ if (kls->userns_capable)
+ return true;
+ } else {
+ struct ns_common *owner;
+ struct user_namespace *user_ns;
+
+ owner = ns_owner(ns);
+ if (owner)
+ user_ns = to_user_ns(owner);
+ else
+ user_ns = &init_user_ns;
+ if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+ return true;
+ }
+
+ if (is_current_namespace(ns))
+ return true;
+
+ if (ns->ns_type != CLONE_NEWUSER)
+ return false;
+
+ if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+ return true;
+
+ return false;
+}
+
+static inline void ns_put(struct ns_common *ns)
+{
+ if (ns && ns->ops)
+ ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+ struct ns_common *candidate)
+{
+ struct ns_common *ns __free(ns_put) = NULL;
+
+ if (!ns_requested(kls, candidate))
+ return NULL;
+
+ ns = ns_get_unless_inactive(candidate);
+ if (!ns)
+ return NULL;
+
+ if (!may_list_ns(kls, ns))
+ return NULL;
+
+ return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
+ const struct list_head *head;
+ ssize_t ret;
+
+ VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+ if (kls->user_ns_id == LISTNS_CURRENT_USER)
+ ns = to_ns_common(current_user_ns());
+ else if (kls->user_ns_id)
+ ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+ if (!ns)
+ return -EINVAL;
+ kls->user_ns = to_user_ns(ns);
+
+ /*
+ * Use the rbtree to find the first namespace we care about and
+ * then use its list entry to iterate from there.
+ */
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
+ kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry);
+
+ ns = first_ns;
+ list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) {
+ struct ns_common *valid;
+
+ if (!nr_ns_ids)
+ break;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+ struct ns_common *ret = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ struct rb_node *node;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+ }
+
+ guard(ns_tree_locked_reader)();
+
+ if (ns_tree)
+ node = ns_tree->ns_rb.rb_node;
+ else
+ node = ns_unified_root.ns_rb.rb_node;
+
+ while (node) {
+ struct ns_common *ns;
+
+ if (ns_type)
+ ns = node_to_ns(node);
+ else
+ ns = node_to_ns_unified(node);
+
+ if (ns_id <= ns->ns_id) {
+ if (ns_type)
+ ret = node_to_ns(node);
+ else
+ ret = node_to_ns_unified(node);
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+ const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return &ns->ns_tree_node.ns_list_entry == head;
+ return &ns->ns_unified_node.ns_list_entry == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns, *first_ns = NULL, *prev = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ const struct list_head *head;
+ u32 ns_type;
+ ssize_t ret;
+
+ if (hweight32(kls->ns_type) == 1)
+ ns_type = kls->ns_type;
+ else
+ ns_type = 0;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return -EINVAL;
+ }
+
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ if (ns_tree)
+ head = &ns_tree->ns_list_head;
+ else
+ head = &ns_unified_root.ns_list_head;
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = first_ns_common(head, ns_tree);
+
+ for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+ ns = next_ns_common(ns, ns_tree)) {
+ struct ns_common *valid;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+ u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+ struct klistns klns __free(klistns_free) = {};
+ const size_t maxcount = 1000000;
+ struct ns_id_req kreq;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (unlikely(nr_ns_ids > maxcount))
+ return -EOVERFLOW;
+
+ if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+ return -EFAULT;
+
+ ret = copy_ns_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+ ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+ if (ret)
+ return ret;
+
+ if (kreq.user_ns_id)
+ return do_listns_userns(&klns);
+
+ return do_listns(&klns);
+}
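
For illustration, a user-space caller of the new listns() syscall might look roughly like the sketch below. It is not part of the patch: __NR_listns, struct ns_id_req, NS_ID_REQ_SIZE_VER0 and the *_NS type flags come from the uapi headers added elsewhere in this series (not shown here), so the exact struct layout is assumed from the kernel code above, and error handling is trimmed.

    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/types.h>
    /* Assumes updated uapi headers providing __NR_listns and struct ns_id_req. */

    int main(void)
    {
        struct ns_id_req req;
        __u64 ids[64];
        long n;

        memset(&req, 0, sizeof(req));
        req.size = sizeof(req);      /* versioned; must be >= NS_ID_REQ_SIZE_VER0 */
        req.ns_type = 0;             /* 0 == list all namespace types */
        req.user_ns_id = 0;          /* 0 == do not filter by owning user namespace */

        do {
            n = syscall(__NR_listns, &req, ids, 64, 0);
            for (long i = 0; i < n; i++)
                printf("%llu\n", (unsigned long long)ids[i]);
            if (n > 0)
                req.ns_id = ids[n - 1];  /* resume after the last id returned */
        } while (n == 64);

        return n < 0 ? 1 : 0;
    }

Setting req.user_ns_id to LISTNS_CURRENT_USER (or to a specific user namespace id) restricts the listing to namespaces owned by that user namespace, which is the do_listns_userns() path above.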
diff --git a/kernel/padata.c b/kernel/padata.c
index f4def028c48c..aa66d91e20f9 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -506,12 +506,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
padata_works_free(&works);
}
-static void __padata_list_init(struct padata_list *pd_list)
-{
- INIT_LIST_HEAD(&pd_list->list);
- spin_lock_init(&pd_list->lock);
-}
-
/* Initialize all percpu queues used by serial workers */
static void padata_init_squeues(struct parallel_data *pd)
{
@@ -521,7 +515,8 @@ static void padata_init_squeues(struct parallel_data *pd)
for_each_cpu(cpu, pd->cpumask.cbcpu) {
squeue = per_cpu_ptr(pd->squeue, cpu);
squeue->pd = pd;
- __padata_list_init(&squeue->serial);
+ INIT_LIST_HEAD(&squeue->serial.list);
+ spin_lock_init(&squeue->serial.lock);
INIT_WORK(&squeue->work, padata_serial_worker);
}
}
@@ -534,7 +529,8 @@ static void padata_init_reorder_list(struct parallel_data *pd)
for_each_cpu(cpu, pd->cpumask.pcpu) {
list = per_cpu_ptr(pd->reorder_list, cpu);
- __padata_list_init(list);
+ INIT_LIST_HEAD(&list->list);
+ spin_lock_init(&list->lock);
}
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 24cc3eec1805..b2f2470af7e5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -873,13 +873,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
disable_trace_on_warning();
- if (file)
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
- raw_smp_processor_id(), current->pid, file, line,
- caller);
- else
- pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
- raw_smp_processor_id(), current->pid, caller);
+ if (file) {
+ pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n",
+ file, line, caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ } else {
+ pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n",
+ caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ }
#pragma GCC diagnostic push
#ifndef __clang__
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fffec767a63..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_pid_ns),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_pid_ns),
-#ifdef CONFIG_PID_NS
- .ns.ops = &pidns_operations,
-#endif
.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
- .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
int i;
+ struct pid_namespace *active_ns;
lockdep_assert_not_held(&tasklist_lock);
+ active_ns = pid->numbers[pid->level].ns;
+ ns_ref_active_put(active_ns);
+
spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
@@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
spin_unlock(&pidmap_lock);
idr_preload_end();
+ ns_ref_active_get(ns);
return pid;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 650be58d8d18..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && ns_ref_put(ns))
+ if (ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 54a623680019..05337f437cca 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
+config PM_QOS_CPU_SYSTEM_WAKEUP
+ bool "User space interface for CPU system wakeup QoS"
+ depends on CPU_IDLE
+ help
+ Enable this to allow user space to specify a CPU system wakeup
+ latency limit via the cpu_wakeup_latency file.
+
+ This may be useful on platforms that support multiple low-power
+ states for CPUs during system-wide suspend, and especially so for
+ s2idle.
+
config PM
bool "Device power management core functionality"
help
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 874ad834dc8d..773e2789412b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
-obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
+obj-$(CONFIG_ENERGY_MODEL) += em.o
+em-y := energy_model.o
+em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 19c48aa5355d..a906a0ac0f9b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list);
* no_console_suspend argument has been passed on the command line, VT
* switches will occur.
*/
-void pm_vt_switch_required(struct device *dev, bool required)
+int pm_vt_switch_required(struct device *dev, bool required)
{
struct pm_vt_switch *entry, *tmp;
+ int ret = 0;
mutex_lock(&vt_switch_mutex);
list_for_each_entry(tmp, &pm_vt_switch_list, head) {
@@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required)
}
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
+ if (!entry) {
+ ret = -ENOMEM;
goto out;
+ }
entry->required = required;
entry->dev = dev;
@@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required)
list_add(&entry->head, &pm_vt_switch_list);
out:
mutex_unlock(&vt_switch_mutex);
+ return ret;
}
EXPORT_SYMBOL(pm_vt_switch_required);
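
Since pm_vt_switch_required() now returns an error code, callers can propagate allocation failures instead of silently ignoring them. A hypothetical caller (the foo_ names are illustrative, not from this patch) might do:

    static int foo_gfx_probe(struct device *dev)
    {
        int ret;

        /* Register that this device needs a VT switch around suspend. */
        ret = pm_vt_switch_required(dev, true);
        if (ret)
            return ret;    /* -ENOMEM if the tracking entry could not be allocated */

        /* ... rest of probe ... */
        return 0;
    }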
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
new file mode 100644
index 000000000000..4b85da138a06
--- /dev/null
+++ b/kernel/power/em_netlink.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/energy_model.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <uapi/linux/energy_model.h>
+
+#include "em_netlink.h"
+#include "em_netlink_autogen.h"
+
+#define EM_A_PD_CPUS_LEN 256
+
+/*************************** Command encoding ********************************/
+static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ int *tot_msg_sz = data;
+ int msg_sz, cpus_sz;
+
+ cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+
+ msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */
+ nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */
+ nla_total_size(cpus_sz); /* EM_A_PD_CPUS */
+
+ *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz));
+ return 0;
+}
+
+static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ struct sk_buff *msg = data;
+ struct nlattr *entry;
+
+ entry = nla_nest_start(msg, EM_A_PDS_PD);
+ if (!entry)
+ goto out_cancel_nest;
+
+ if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id))
+ goto out_cancel_nest;
+
+ if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD))
+ goto out_cancel_nest;
+
+ snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+ if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf))
+ goto out_cancel_nest;
+
+ nla_nest_end(msg, entry);
+
+ return 0;
+
+out_cancel_nest:
+ nla_nest_cancel(msg, entry);
+
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int cmd = info->genlhdr->cmd;
+ int ret = -EMSGSIZE, msg_sz = 0;
+
+ for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = for_each_em_perf_domain(__em_nl_get_pd, msg);
+ if (ret)
+ goto out_cancel_msg;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs)
+{
+ struct em_perf_domain *pd;
+ int id;
+
+ if (!attrs[EM_A_PD_TABLE_PD_ID])
+ return NULL;
+
+ id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]);
+ pd = em_perf_domain_get_by_id(id);
+ return pd;
+}
+
+static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd)
+{
+ int id_sz, ps_sz;
+
+ id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+ ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */
+ nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */
+ ps_sz *= pd->nr_perf_states;
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz));
+}
+
+static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd)
+{
+ struct em_perf_state *table, *ps;
+ struct nlattr *entry;
+ int i;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id))
+ goto out_err;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd((struct em_perf_domain *)pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ ps = &table[i];
+
+ entry = nla_nest_start(msg, EM_A_PD_TABLE_PS);
+ if (!entry)
+ goto out_unlock_ps;
+
+ if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE,
+ ps->performance, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY,
+ ps->frequency, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_POWER,
+ ps->power, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_COST,
+ ps->cost, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS,
+ ps->flags, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+
+ nla_nest_end(msg, entry);
+ }
+ rcu_read_unlock();
+ return 0;
+
+out_cancel_ps_nest:
+ nla_nest_cancel(msg, entry);
+out_unlock_ps:
+ rcu_read_unlock();
+out_err:
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ int cmd = info->genlhdr->cmd;
+ int msg_sz, ret = -EMSGSIZE;
+ struct em_perf_domain *pd;
+ struct sk_buff *msg;
+ void *hdr;
+
+ pd = __em_nl_get_pd_table_id(info->attrs);
+ if (!pd)
+ return -EINVAL;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_free_msg:
+ nlmsg_free(msg);
+ return ret;
+}
+
+
+/**************************** Event encoding *********************************/
+static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
+{
+ struct sk_buff *msg;
+ int msg_sz, ret = -EMSGSIZE;
+ void *hdr;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+void em_notify_pd_created(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_CREATED);
+}
+
+void em_notify_pd_updated(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_UPDATED);
+}
+
+static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd)
+{
+ int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz));
+}
+
+void em_notify_pd_deleted(const struct em_perf_domain *pd)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int msg_sz;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_notify_pd_deleted_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED);
+ if (!hdr)
+ goto out_free_msg;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id))
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+/**************************** Initialization *********************************/
+static int __init em_netlink_init(void)
+{
+ return genl_register_family(&em_nl_family);
+}
+postcore_initcall(em_netlink_init);
diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h
new file mode 100644
index 000000000000..583d7f1c3939
--- /dev/null
+++ b/kernel/power/em_netlink.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+#ifndef _EM_NETLINK_H
+#define _EM_NETLINK_H
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data);
+struct em_perf_domain *em_perf_domain_get_by_id(int id);
+void em_notify_pd_created(const struct em_perf_domain *pd);
+void em_notify_pd_deleted(const struct em_perf_domain *pd);
+void em_notify_pd_updated(const struct em_perf_domain *pd);
+#else
+static inline
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ return -EINVAL;
+}
+static inline
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ return NULL;
+}
+
+static inline void em_notify_pd_created(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {}
+#endif
+
+#endif /* _EM_NETLINK_H */
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
new file mode 100644
index 000000000000..a7a09ab1d1c2
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "em_netlink_autogen.h"
+
+#include <uapi/linux/energy_model.h>
+
+/* EM_CMD_GET_PD_TABLE - do */
+static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = {
+ [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, },
+};
+
+/* Ops table for em */
+static const struct genl_split_ops em_nl_ops[] = {
+ {
+ .cmd = EM_CMD_GET_PDS,
+ .doit = em_nl_get_pds_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = EM_CMD_GET_PD_TABLE,
+ .doit = em_nl_get_pd_table_doit,
+ .policy = em_get_pd_table_nl_policy,
+ .maxattr = EM_A_PD_TABLE_PD_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group em_nl_mcgrps[] = {
+ [EM_NLGRP_EVENT] = { "event", },
+};
+
+struct genl_family em_nl_family __ro_after_init = {
+ .name = EM_FAMILY_NAME,
+ .version = EM_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = em_nl_ops,
+ .n_split_ops = ARRAY_SIZE(em_nl_ops),
+ .mcgrps = em_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps),
+};
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
new file mode 100644
index 000000000000..78ce609641f1
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_EM_GEN_H
+#define _LINUX_EM_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/energy_model.h>
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ EM_NLGRP_EVENT,
+};
+
+extern struct genl_family em_nl_family;
+
+#endif /* _LINUX_EM_GEN_H */
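
To give a feel for the notification side, a minimal user-space listener for the family's "event" multicast group could be sketched with libnl-3/libnl-genl-3 as below. This is an illustration only: it assumes the installed uapi header exports EM_FAMILY_NAME and the EM_A_* attribute constants, and error handling is omitted.

    #include <stdio.h>
    #include <netlink/netlink.h>
    #include <netlink/genl/genl.h>
    #include <netlink/genl/ctrl.h>
    #include <linux/energy_model.h>

    static int on_event(struct nl_msg *msg, void *arg)
    {
        struct nlattr *attrs[EM_A_PD_TABLE_PD_ID + 1];

        /* Attributes above EM_A_PD_TABLE_PD_ID (e.g. the nested perf states)
         * are ignored by this simple parser. */
        if (!genlmsg_parse(nlmsg_hdr(msg), 0, attrs, EM_A_PD_TABLE_PD_ID, NULL) &&
            attrs[EM_A_PD_TABLE_PD_ID])
            printf("perf domain %u created/updated/deleted\n",
                   nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]));
        return NL_OK;
    }

    int main(void)
    {
        struct nl_sock *sk = nl_socket_alloc();

        nl_socket_disable_seq_check(sk);
        nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, on_event, NULL);
        genl_connect(sk);
        nl_socket_add_membership(sk,
                genl_ctrl_resolve_grp(sk, EM_FAMILY_NAME, "event"));
        while (nl_recvmsgs_default(sk) >= 0)
            ;
        nl_socket_free(sk);
        return 0;
    }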
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 5f17d2e8e954..11af9f64aa82 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -17,12 +17,24 @@
#include <linux/sched/topology.h>
#include <linux/slab.h>
+#include "em_netlink.h"
+
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
+/*
+ * Manage performance domains with IDs. One can iterate over the performance
+ * domains through the list and pick one by its ID. The mutex serializes the
+ * list access. When holding em_pd_list_mutex, em_pd_mutex should not be
+ * taken to avoid potential deadlock.
+ */
+static DEFINE_IDA(em_pd_ida);
+static LIST_HEAD(em_pd_list);
+static DEFINE_MUTEX(em_pd_list_mutex);
+
static void em_cpufreq_update_efficiencies(struct device *dev,
struct em_perf_state *table);
static void em_check_capacity_update(void);
@@ -116,6 +128,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
+static int em_debug_id_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+
+ seq_printf(s, "%d\n", pd->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_id);
+
static void em_debug_create_pd(struct device *dev)
{
struct em_dbg_info *em_dbg;
@@ -132,6 +154,8 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("flags", 0444, d, dev->em_pd,
&em_debug_flags_fops);
+ debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);
+
em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
sizeof(*em_dbg), GFP_KERNEL);
if (!em_dbg)
@@ -328,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev,
em_table_free(old_table);
mutex_unlock(&em_pd_mutex);
+
+ em_notify_pd_updated(pd);
return 0;
}
EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
@@ -396,7 +422,7 @@ static int em_create_pd(struct device *dev, int nr_states,
struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
- int cpu, ret, num_cpus;
+ int cpu, ret, num_cpus, id;
if (_is_cpu_device(dev)) {
num_cpus = cpumask_weight(cpus);
@@ -420,6 +446,13 @@ static int em_create_pd(struct device *dev, int nr_states,
pd->nr_perf_states = nr_states;
+ INIT_LIST_HEAD(&pd->node);
+
+ id = ida_alloc(&em_pd_ida, GFP_KERNEL);
+ if (id < 0) {
+ kfree(pd);
+ return id;
+ }
+ pd->id = id;
+
em_table = em_table_alloc(pd);
if (!em_table)
goto free_pd;
@@ -444,6 +477,7 @@ free_pd_table:
kfree(em_table);
free_pd:
kfree(pd);
+ ida_free(&em_pd_ida, id);
return -EINVAL;
}
@@ -659,8 +693,16 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
unlock:
mutex_unlock(&em_pd_mutex);
+ if (ret)
+ return ret;
- return ret;
+ mutex_lock(&em_pd_list_mutex);
+ list_add_tail(&dev->em_pd->node, &em_pd_list);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_created(dev->em_pd);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
@@ -678,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev)
if (_is_cpu_device(dev))
return;
+ mutex_lock(&em_pd_list_mutex);
+ list_del_init(&dev->em_pd->node);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_deleted(dev->em_pd);
+
/*
* The mutex separates all register/unregister requests and protects
* from potential clean-up/setup issues in the debugfs directories.
@@ -689,6 +737,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
lockdep_is_held(&em_pd_mutex)));
+ ida_free(&em_pd_ida, dev->em_pd->id);
+
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
@@ -958,3 +1008,39 @@ void em_rebuild_sched_domains(void)
*/
schedule_work(&rebuild_sd_work);
}
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ int ret;
+
+ ret = cb(pd, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ if (pd->id == id)
+ return pd;
+ }
+
+ return NULL;
+}
+#endif
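
As a quick illustration of the iterator contract (the callback's non-zero return stops the walk, and em_pd_mutex must not be held by the caller), a hypothetical in-kernel user counting the registered performance domains could look like this; the foo_ names are made up for the example:

    static int foo_count_pd(struct em_perf_domain *pd, void *data)
    {
        (*(unsigned int *)data)++;
        return 0;               /* returning non-zero would stop the iteration */
    }

    static unsigned int foo_nr_perf_domains(void)
    {
        unsigned int nr = 0;

        for_each_em_perf_domain(foo_count_pd, &nr);
        return nr;
    }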
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 14e85ff23551..af8d07bafe02 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -706,7 +706,6 @@ static void power_down(void)
#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
- pm_restore_gfp_mask();
error = suspend_devices_and_enter(mem_sleep_current);
if (!error)
goto exit;
@@ -746,9 +745,6 @@ static void power_down(void)
cpu_relax();
exit:
- /* Match the pm_restore_gfp_mask() call in hibernate(). */
- pm_restrict_gfp_mask();
-
/* Restore swap signature. */
error = swsusp_unmark();
if (error)
@@ -824,9 +820,11 @@ int hibernate(void)
if (error)
goto Restore;
- ksys_sync_helper();
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ error = pm_sleep_fs_sync();
+ if (error)
+ goto Notify;
+
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -896,6 +894,7 @@ int hibernate(void)
freezer_test_done = false;
Exit:
filesystems_thaw();
+ Notify:
pm_notifier_call_chain(PM_POST_HIBERNATION);
Restore:
pm_restore_console();
@@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -1083,8 +1081,7 @@ static int software_resume(void)
if (error)
goto Restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3cf2d7e72567..03b2c5495c77 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -18,6 +18,8 @@
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
#include "power.h"
@@ -31,23 +33,35 @@
* held, unless the suspend/hibernate code is guaranteed not to run in parallel
* with that modification).
*/
+static unsigned int saved_gfp_count;
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
+
+ if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ return;
+
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+
+ pm_pr_dbg("GFP mask restored\n");
}
void pm_restrict_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
+
+ if (saved_gfp_count++) {
+ WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask);
+ return;
+ }
+
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+
+ pm_pr_dbg("GFP mask restricted\n");
}
unsigned int lock_system_sleep(void)
@@ -80,6 +94,61 @@ void ksys_sync_helper(void)
}
EXPORT_SYMBOL_GPL(ksys_sync_helper);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+/* Wakeup events handling resolution while syncing file systems in jiffies */
+#define PM_FS_SYNC_WAKEUP_RESOLUTION 5
+
+static atomic_t pm_fs_sync_count = ATOMIC_INIT(0);
+static struct workqueue_struct *pm_fs_sync_wq;
+static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait);
+
+static bool pm_fs_sync_completed(void)
+{
+ return atomic_read(&pm_fs_sync_count) == 0;
+}
+
+static void pm_fs_sync_work_fn(struct work_struct *work)
+{
+ ksys_sync_helper();
+
+ if (atomic_dec_and_test(&pm_fs_sync_count))
+ wake_up(&pm_fs_sync_wait);
+}
+static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn);
+
+/**
+ * pm_sleep_fs_sync() - Sync file systems in an interruptible way
+ *
+ * Return: 0 on successful file system sync, or -EBUSY if the file system sync
+ * was aborted.
+ */
+int pm_sleep_fs_sync(void)
+{
+ pm_wakeup_clear(0);
+
+ /*
+ * Take back-to-back sleeps into account by queuing a subsequent fs sync
+ * only if the previous fs sync is running or is not queued. Multiple fs
+ * syncs increase the likelihood of saving the latest files immediately
+ * before sleep.
+ */
+ if (!work_pending(&pm_fs_sync_work)) {
+ atomic_inc(&pm_fs_sync_count);
+ queue_work(pm_fs_sync_wq, &pm_fs_sync_work);
+ }
+
+ while (!pm_fs_sync_completed()) {
+ if (pm_wakeup_pending())
+ return -EBUSY;
+
+ wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(),
+ PM_FS_SYNC_WAKEUP_RESOLUTION);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
@@ -219,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
/*
- * sync_on_suspend: invoke ksys_sync_helper() before suspend.
+ * sync_on_suspend: Sync file systems before suspend.
*
- * show() returns whether ksys_sync_helper() is invoked before suspend.
- * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it.
+ * show() returns whether file system sync before suspend is enabled.
+ * store() accepts 0 or 1. 0 disables file system sync and 1 enables it.
*/
bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
@@ -1054,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = {
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
-static int __init pm_start_workqueue(void)
+static int __init pm_start_workqueues(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0);
+ if (!pm_wq)
+ return -ENOMEM;
- return pm_wq ? 0 : -ENOMEM;
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0);
+ if (!pm_fs_sync_wq) {
+ destroy_workqueue(pm_wq);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
}
static int __init pm_init(void)
{
- int error = pm_start_workqueue();
+ int error = pm_start_workqueues();
if (error)
return error;
hibernate_image_size_init();
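
The switch from a single saved mask to a saved_gfp_count reference count means pm_restrict_gfp_mask()/pm_restore_gfp_mask() pairs may now nest under system_transition_mutex; the mask is only restored when the outermost restore runs. A purely illustrative sketch of the semantics (not code from this patch):

    static void pm_gfp_mask_nesting_example(void)
    {
        unsigned int flags = lock_system_sleep();

        pm_restrict_gfp_mask();   /* count 0 -> 1: __GFP_IO/__GFP_FS cleared */
        pm_restrict_gfp_mask();   /* count 1 -> 2: mask left restricted */

        pm_restore_gfp_mask();    /* count 2 -> 1: still restricted */
        pm_restore_gfp_mask();    /* count 1 -> 0: gfp_allowed_mask restored */

        unlock_system_sleep(flags);
    }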
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7ccd709af93f..75b63843886e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -19,6 +19,7 @@ struct swsusp_info {
} __aligned(PAGE_SIZE);
#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern int pm_sleep_fs_sync(void);
extern bool filesystem_freeze_enabled;
#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ff68ebaa1e0..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,6 +132,7 @@ int freeze_processes(void)
if (!pm_freezing)
static_branch_inc(&freezer_active);
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
if (!error)
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 4244b069442e..f7d8064e9adc 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = {
.fops = &cpu_latency_qos_fops,
};
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+/* The CPU system wakeup latency QoS. */
+static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
+ .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .type = PM_QOS_MIN,
+};
+
+/**
+ * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
+ *
+ * Returns the current CPU system wakeup latency QoS limit that may have been
+ * requested by user space.
+ */
+s32 cpu_wakeup_latency_qos_limit(void)
+{
+ return pm_qos_read_value(&cpu_wakeup_latency_constraints);
+}
+
+static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
+{
+ struct pm_qos_request *req;
+
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->qos = &cpu_wakeup_latency_constraints;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ filp->private_data = req;
+
+ return 0;
+}
+
+static int cpu_wakeup_latency_qos_release(struct inode *inode,
+ struct file *filp)
+{
+ struct pm_qos_request *req = filp->private_data;
+
+ filp->private_data = NULL;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ kfree(req);
+
+ return 0;
+}
+
+static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
+static ssize_t cpu_wakeup_latency_qos_write(struct file *filp,
+ const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ struct pm_qos_request *req = filp->private_data;
+ s32 value;
+
+ if (count == sizeof(s32)) {
+ if (copy_from_user(&value, buf, sizeof(s32)))
+ return -EFAULT;
+ } else {
+ int ret;
+
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
+ }
+
+ if (value < 0)
+ return -EINVAL;
+
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value);
+
+ return count;
+}
+
+static const struct file_operations cpu_wakeup_latency_qos_fops = {
+ .open = cpu_wakeup_latency_qos_open,
+ .release = cpu_wakeup_latency_qos_release,
+ .read = cpu_wakeup_latency_qos_read,
+ .write = cpu_wakeup_latency_qos_write,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice cpu_wakeup_latency_qos_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cpu_wakeup_latency",
+ .fops = &cpu_wakeup_latency_qos_fops,
+};
+#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */
+
static int __init cpu_latency_qos_init(void)
{
int ret;
@@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void)
pr_err("%s: %s setup failed\n", __func__,
cpu_latency_qos_miscdev.name);
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+ ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
+ if (ret < 0)
+ pr_err("%s: %s setup failed\n", __func__,
+ cpu_wakeup_latency_qos_miscdev.name);
+#endif
+
return ret;
}
late_initcall(cpu_latency_qos_init);
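
For reference, the new misc device is meant to be used the same way as /dev/cpu_dma_latency: a process opens it, writes the limit it needs, and the constraint is dropped when the file descriptor is closed. A hedged user-space sketch follows; the /dev/cpu_wakeup_latency path assumes the usual udev naming for misc devices, and the unit is taken to be microseconds like the other PM QoS interfaces.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int32_t limit = 100;    /* requested CPU system wakeup latency limit */
        int fd = open("/dev/cpu_wakeup_latency", O_RDWR);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* A write of exactly sizeof(s32) takes the binary path; any other
         * length is parsed as hexadecimal text. Negative values are rejected. */
        if (write(fd, &limit, sizeof(limit)) != sizeof(limit)) {
            perror("write");
            close(fd);
            return 1;
        }
        pause();                /* constraint held until the fd is closed */
        close(fd);
        return 0;
    }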
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 645f42e40478..0a946932d5c1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2110,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- pr_info("Creating image:\n");
+ pm_deferred_pr_dbg("Creating image\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
nr_highmem = count_highmem_pages();
- pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
+ pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem);
if (!enough_free_mem(nr_pages, nr_highmem)) {
- pr_err("Not enough free memory\n");
+ pm_deferred_pr_dbg("Not enough free memory for image creation\n");
return -ENOMEM;
}
- if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
- pr_err("Memory allocation failed\n");
+ if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem))
return -ENOMEM;
- }
/*
* During allocating of suspend pagedir, new cold pages may appear.
@@ -2144,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void)
nr_zero_pages = nr_pages - nr_copy_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages);
+ pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n",
+ nr_copy_pages, nr_zero_pages);
return 0;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4bb4686c1c08..2da4482bb6eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -344,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay,
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
+ int i;
+
if (pm_test_level == level) {
pr_info("suspend debug: Waiting for %d second(s).\n",
pm_test_delay);
- mdelay(pm_test_delay * 1000);
+ for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++)
+ msleep(1000);
+
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
@@ -375,8 +379,7 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
@@ -590,12 +593,15 @@ static int enter_state(suspend_state_t state)
if (sync_on_suspend_enabled) {
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- ksys_sync_helper();
+
+ error = pm_sleep_fs_sync();
+ if (error)
+ goto Unlock;
+
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
}
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
- pm_wakeup_clear(0);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 0beff7eeaaba..33a186373bef 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -46,19 +46,18 @@ static bool clean_pages_on_read;
static bool clean_pages_on_decompress;
/*
- * The swap map is a data structure used for keeping track of each page
- * written to a swap partition. It consists of many swap_map_page
- * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
- * These structures are stored on the swap and linked together with the
- * help of the .next_swap member.
+ * The swap map is a data structure used for keeping track of each page
+ * written to a swap partition. It consists of many swap_map_page structures
+ * that each contain an array of MAP_PAGE_ENTRIES swap entries. These
+ * structures are stored on the swap and linked together with the help of the
+ * .next_swap member.
*
- * The swap map is created during suspend. The swap map pages are
- * allocated and populated one at a time, so we only need one memory
- * page to set up the entire structure.
+ * The swap map is created during suspend. The swap map pages are allocated and
+ * populated one at a time, so we only need one memory page to set up the entire
+ * structure.
*
- * During resume we pick up all swap_map_page structures into a list.
+ * During resume we pick up all swap_map_page structures into a list.
*/
-
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
/*
@@ -89,10 +88,8 @@ struct swap_map_page_list {
};
/*
- * The swap_map_handle structure is used for handling swap in
- * a file-alike way
+ * The swap_map_handle structure is used for handling swap in a file-alike way.
*/
-
struct swap_map_handle {
struct swap_map_page *cur;
struct swap_map_page_list *maps;
@@ -117,10 +114,9 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
/*
- * The following functions are used for tracing the allocated
- * swap pages, so that they can be freed in case of an error.
+ * The following functions are used for tracing the allocated swap pages, so
+ * that they can be freed in case of an error.
*/
-
struct swsusp_extent {
struct rb_node node;
unsigned long start;
@@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/*
- * alloc_swapdev_block - allocate a swap page and register that it has
- * been allocated, so that it can be freed in case of an error.
- */
-
sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
+ /*
+ * Allocate a swap page and register that it has been allocated, so that
+ * it can be freed in case of an error.
+ */
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
if (swsusp_extents_insert(offset))
@@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/*
- * free_all_swap_pages - free swap pages allocated for saving image data.
- * It also frees the extents used to register which swap entries had been
- * allocated.
- */
-
void free_all_swap_pages(int swap)
{
struct rb_node *node;
+ /*
+ * Free swap pages allocated for saving image data. It also frees the
+ * extents used to register which swap entries had been allocated.
+ */
while ((node = swsusp_extents.rb_node)) {
struct swsusp_extent *ext;
@@ -303,6 +296,7 @@ static int hib_wait_io(struct hib_bio_batch *hb)
/*
* Saving part
*/
+
static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
@@ -336,16 +330,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
*/
unsigned int swsusp_header_flags;
-/**
- * swsusp_swap_check - check if the resume device is a swap device
- * and get its index (if so)
- *
- * This is called before saving image
- */
static int swsusp_swap_check(void)
{
int res;
+ /*
+ * Check if the resume device is a swap device and get its index (if so).
+ * This is called before saving the image.
+ */
if (swsusp_resume_device)
res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
else
@@ -362,13 +354,6 @@ static int swsusp_swap_check(void)
return 0;
}
-/**
- * write_page - Write one page to given swap location.
- * @buf: Address we're writing.
- * @offset: Offset of the swap page we're writing to.
- * @hb: bio completion batch
- */
-
static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
@@ -519,17 +504,14 @@ static int swap_writer_finish(struct swap_map_handle *handle,
CMP_HEADER, PAGE_SIZE)
#define CMP_SIZE (CMP_PAGES * PAGE_SIZE)
-/* Maximum number of threads for compression/decompression. */
-#define CMP_THREADS 3
+/* Default number of threads for compression/decompression. */
+#define CMP_THREADS 3
+static unsigned int hibernate_compression_threads = CMP_THREADS;
/* Minimum/maximum number of pages for read buffering. */
#define CMP_MIN_RD_PAGES 1024
#define CMP_MAX_RD_PAGES 8192
-/**
- * save_image - save the suspend image data
- */
-
static int save_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_write)
@@ -585,13 +567,48 @@ struct crc_data {
wait_queue_head_t go; /* start crc update */
wait_queue_head_t done; /* crc update done */
u32 *crc32; /* points to handle's crc32 */
- size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */
- unsigned char *unc[CMP_THREADS]; /* uncompressed data */
+ size_t **unc_len; /* uncompressed lengths */
+ unsigned char **unc; /* uncompressed data */
};
-/*
- * CRC32 update function that runs in its own thread.
- */
+static struct crc_data *alloc_crc_data(int nr_threads)
+{
+ struct crc_data *crc;
+
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc)
+ return NULL;
+
+ crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL);
+ if (!crc->unc)
+ goto err_free_crc;
+
+ crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL);
+ if (!crc->unc_len)
+ goto err_free_unc;
+
+ return crc;
+
+err_free_unc:
+ kfree(crc->unc);
+err_free_crc:
+ kfree(crc);
+ return NULL;
+}
+
+static void free_crc_data(struct crc_data *crc)
+{
+ if (!crc)
+ return;
+
+ if (crc->thr)
+ kthread_stop(crc->thr);
+
+ kfree(crc->unc_len);
+ kfree(crc->unc);
+ kfree(crc);
+}
+
static int crc32_threadfn(void *data)
{
struct crc_data *d = data;
@@ -616,6 +633,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
+
/*
* Structure used for data compression.
*/
@@ -635,11 +653,8 @@ struct cmp_data {
};
/* Indicates the image size after compression */
-static atomic_t compressed_size = ATOMIC_INIT(0);
+static atomic64_t compressed_size = ATOMIC64_INIT(0);
-/*
- * Compression function that runs in its own thread.
- */
static int compress_threadfn(void *data)
{
struct cmp_data *d = data;
@@ -664,19 +679,13 @@ static int compress_threadfn(void *data)
d->ret = crypto_acomp_compress(d->cr);
d->cmp_len = d->cr->dlen;
- atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len);
+ atomic64_add(d->cmp_len, &compressed_size);
atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
}
-/**
- * save_compressed_image - Save the suspend image data after compression.
- * @handle: Swap map handle to use for saving the image.
- * @snapshot: Image to read data from.
- * @nr_to_write: Number of pages to save.
- */
static int save_compressed_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_write)
@@ -689,21 +698,21 @@ static int save_compressed_image(struct swap_map_handle *handle,
ktime_t start;
ktime_t stop;
size_t off;
- unsigned thr, run_threads, nr_threads;
+ unsigned int thr, run_threads, nr_threads;
unsigned char *page = NULL;
struct cmp_data *data = NULL;
struct crc_data *crc = NULL;
hib_init_batch(&hb);
- atomic_set(&compressed_size, 0);
+ atomic64_set(&compressed_size, 0);
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads);
page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
@@ -719,7 +728,7 @@ static int save_compressed_image(struct swap_map_handle *handle,
goto out_clean;
}
- crc = kzalloc(sizeof(*crc), GFP_KERNEL);
+ crc = alloc_crc_data(nr_threads);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
@@ -877,19 +886,18 @@ out_finish:
stop = ktime_get();
if (!ret)
ret = err2;
- if (!ret)
+ if (!ret) {
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ pr_info("Image size after compression: %lld kbytes\n",
+ (atomic64_read(&compressed_size) / 1024));
pr_info("Image saving done\n");
- swsusp_show_speed(start, stop, nr_to_write, "Wrote");
- pr_info("Image size after compression: %d kbytes\n",
- (atomic_read(&compressed_size) / 1024));
+ } else {
+ pr_err("Image saving failed: %d\n", ret);
+ }
out_clean:
hib_finish_batch(&hb);
- if (crc) {
- if (crc->thr)
- kthread_stop(crc->thr);
- kfree(crc);
- }
+ free_crc_data(crc);
if (data) {
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
@@ -899,18 +907,12 @@ out_clean:
}
vfree(data);
}
- if (page) free_page((unsigned long)page);
+ if (page)
+ free_page((unsigned long)page);
return ret;
}
-/**
- * enough_swap - Make sure we have enough swap to save the image.
- *
- * Returns TRUE or FALSE after checking the total amount of swap
- * space available from the resume partition.
- */
-
static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
@@ -923,15 +925,16 @@ static int enough_swap(unsigned int nr_pages)
}
/**
- * swsusp_write - Write entire image and metadata.
- * @flags: flags to pass to the "boot" kernel in the image header
+ * swsusp_write - Write entire image and metadata.
+ * @flags: flags to pass to the "boot" kernel in the image header
+ *
+ * It is important _NOT_ to umount filesystems at this point. We want them
+ * synced (in case something goes wrong) but we DO NOT want to mark the
+ * filesystem clean: it is not. (And it does not matter; if we resume correctly,
+ * we'll mark the system clean anyway.)
*
- * It is important _NOT_ to umount filesystems at this point. We want
- * them synced (in case something goes wrong) but we DO not want to mark
- * filesystem clean: it is not. (And it does not matter, if we resume
- * correctly, we'll mark system clean, anyway.)
+ * Return: 0 on success, negative error code on failure.
*/
-
int swsusp_write(unsigned int flags)
{
struct swap_map_handle handle;
@@ -974,8 +977,8 @@ out_finish:
}
/*
- * The following functions allow us to read data using a swap map
- * in a file-like way.
+ * The following functions allow us to read data using a swap map in a file-like
+ * way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -1077,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle)
return 0;
}
-/**
- * load_image - load the image using the swap map handle
- * @handle and the snapshot handle @snapshot
- * (assume there are @nr_pages pages to load)
- */
-
static int load_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_read)
@@ -1153,9 +1150,6 @@ struct dec_data {
unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
-/*
- * Decompression function that runs in its own thread.
- */
static int decompress_threadfn(void *data)
{
struct dec_data *d = data;
@@ -1190,12 +1184,6 @@ static int decompress_threadfn(void *data)
return 0;
}
-/**
- * load_compressed_image - Load compressed image data and decompress it.
- * @handle: Swap map handle to use for loading data.
- * @snapshot: Image to copy uncompressed data into.
- * @nr_to_read: Number of pages to load.
- */
static int load_compressed_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_read)
@@ -1223,7 +1211,7 @@ static int load_compressed_image(struct swap_map_handle *handle,
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads);
page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page));
if (!page) {
@@ -1239,7 +1227,7 @@ static int load_compressed_image(struct swap_map_handle *handle,
goto out_clean;
}
- crc = kzalloc(sizeof(*crc), GFP_KERNEL);
+ crc = alloc_crc_data(nr_threads);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
@@ -1506,11 +1494,7 @@ out_clean:
hib_finish_batch(&hb);
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
- if (crc) {
- if (crc->thr)
- kthread_stop(crc->thr);
- kfree(crc);
- }
+ free_crc_data(crc);
if (data) {
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
@@ -1529,8 +1513,9 @@ out_clean:
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
* be written into this memory location
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-
int swsusp_read(unsigned int *flags_p)
{
int error;
@@ -1567,8 +1552,9 @@ static void *swsusp_holder;
/**
* swsusp_check - Open the resume device and check for the swsusp signature.
* @exclusive: Open the resume device exclusively.
+ *
+ * Return: 0 if a valid image is found, negative error code otherwise.
*/
-
int swsusp_check(bool exclusive)
{
void *holder = exclusive ? &swsusp_holder : NULL;
@@ -1618,7 +1604,6 @@ put:
/**
* swsusp_close - close resume device.
*/
-
void swsusp_close(void)
{
if (IS_ERR(hib_resume_bdev_file)) {
@@ -1630,9 +1615,10 @@ void swsusp_close(void)
}
/**
- * swsusp_unmark - Unmark swsusp signature in the resume device
+ * swsusp_unmark - Unmark swsusp signature in the resume device
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-
#ifdef CONFIG_SUSPEND
int swsusp_unmark(void)
{
@@ -1658,8 +1644,46 @@ int swsusp_unmark(void)
}
#endif
+static ssize_t hibernate_compression_threads_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", hibernate_compression_threads);
+}
+
+static ssize_t hibernate_compression_threads_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 0, &val))
+ return -EINVAL;
+
+ if (val < 1)
+ return -EINVAL;
+
+ hibernate_compression_threads = val;
+ return n;
+}
+power_attr(hibernate_compression_threads);
+
+static struct attribute *g[] = {
+ &hibernate_compression_threads_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group attr_group = {
+ .attrs = g,
+};
+
static int __init swsusp_header_init(void)
{
+ int error;
+
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return error;
+
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
if (!swsusp_header)
panic("Could not allocate memory for swsusp_header\n");
@@ -1667,3 +1691,19 @@ static int __init swsusp_header_init(void)
}
core_initcall(swsusp_header_init);
+
+static int __init hibernate_compression_threads_setup(char *str)
+{
+ int rc = kstrtouint(str, 0, &hibernate_compression_threads);
+
+ if (rc)
+ return rc;
+
+ if (hibernate_compression_threads < 1)
+ hibernate_compression_threads = CMP_THREADS;
+
+ return 1;
+}
+
+__setup("hibernate_compression_threads=", hibernate_compression_threads_setup);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f9e3efb9f6e..4401cfe26e5c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (data->frozen)
break;
- ksys_sync_helper();
+ error = pm_sleep_fs_sync();
+ if (error)
+ break;
error = freeze_processes();
if (error)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 75a84efad40f..392ec2f75f01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
unsigned long size, void __user *data)
{
struct ptrace_rseq_configuration conf = {
- .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = task->rseq_len,
- .signature = task->rseq_sig,
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+ .rseq_abi_size = task->rseq.len,
+ .signature = task->rseq.sig,
.flags = 0,
};
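
The ptrace hunk above is one of many call sites converted from the old flat task fields (rseq, rseq_len, rseq_sig) to a consolidated per-task rseq structure. The sketch below is inferred purely from the accesses in this patch (usrptr, len, sig and the event bits has_rseq, sched_switch, user_irq, error, all); the struct names and the exact shape of the event word are assumptions, and the authoritative definitions live in a header added elsewhere in the series:

struct rseq_event {				/* assumed: flag bits with an 'all' aggregate view */
	union {
		u32	all;
		struct {
			u8	has_rseq;
			u8	sched_switch;
			u8	user_irq;
			u8	error;
		};
	};
};

struct rseq_data {				/* embedded in task_struct as 'rseq' (name assumed) */
	struct rseq __user	*usrptr;	/* registered user-space rseq area */
	u32			len;		/* registered length */
	u32			sig;		/* abort signature */
	struct rseq_event	event;
};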
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c1ebfd51768b..585cade21010 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -70,12 +70,10 @@ void rcu_qs(void)
*/
void rcu_sched_clock_irq(int user)
{
- if (user) {
+ if (user)
rcu_qs();
- } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
- }
+ else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+ set_need_resched_current();
}
/*
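
Every RCU hunk in this patch folds the former set_tsk_need_resched(current) + set_preempt_need_resched() pair into the new set_need_resched_current() helper, which is introduced elsewhere in this series. Going by the code it replaces, it is presumably a thin wrapper along these lines; the lockdep assertion is an assumption, suggested by the guard(irqsave)() that the rcu_core() hunk below adds around one call site:

static __always_inline void set_need_resched_current(void)
{
	lockdep_assert_irqs_disabled();		/* assumed requirement */
	set_tsk_need_resched(current);
	set_preempt_need_resched();
}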
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8293bae1dec1..85b82a7007b9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user)
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
/* Idle and userspace execution already are quiescent states. */
- if (!rcu_is_cpu_rrupt_from_idle() && !user) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
- }
+ if (!rcu_is_cpu_rrupt_from_idle() && !user)
+ set_need_resched_current();
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);
@@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work)
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(void)
{
- unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
@@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void)
if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ guard(irqsave)();
+ set_need_resched_current();
}
/* Update RCU state based on any recent quiescent states. */
@@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
- local_irq_save(flags);
+ guard(irqsave)();
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
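
The rcu_core() hunk above also converts an explicit local_irq_save()/local_irq_restore() pair into the scope-based guard(irqsave)() from the kernel's cleanup.h guard infrastructure. The two forms are equivalent; the guard restores the saved interrupt state automatically when the scope ends, including on early returns. Illustrative only, with do_work_with_irqs_off() as a placeholder:

	unsigned long flags;

	/* Before: manual save/restore around the critical section. */
	local_irq_save(flags);
	do_work_with_irqs_off();
	local_irq_restore(flags);

	/* After: interrupts are disabled for exactly the guarded scope. */
	{
		guard(irqsave)();
		do_work_with_irqs_off();
	}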
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6058a734090c..96c49c56fc14 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void)
__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
/* Store .exp before .rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
}
#ifdef CONFIG_PREEMPT_RCU
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d85763336b3c..dbe2d02be824 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Also if no expediting and no possible deboosting,
// slow is OK. Plus nohz_full CPUs eventually get
// tick enabled.
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
cpu_online(rdp->cpu)) {
@@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user)
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
- if (rcu_preempt_need_deferred_qs(t)) {
- set_tsk_need_resched(t);
- set_preempt_need_resched();
- }
+ if (rcu_preempt_need_deferred_qs(t))
+ set_need_resched_current();
} else if (rcu_preempt_need_deferred_qs(t)) {
rcu_preempt_deferred_qs(t); /* Report deferred QS. */
return;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d16afeb11506..b67532cb8770 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
* progress and it could be we're stuck in kernel space without context
* switches for an entirely unreasonable amount of time.
*/
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ set_need_resched_current();
}
static bool csd_lock_suppress_rcu_stall;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2452b7366b00..395d8b002350 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -8,98 +8,7 @@
* Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*/
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/rseq.h>
-#include <linux/types.h>
-#include <linux/ratelimit.h>
-#include <asm/ptrace.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/rseq.h>
-
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE 32
-
-#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
- return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- u32 cpu_id_start, cpu_id, node_id, mm_cid;
- struct rseq __user *rseq = t->rseq;
-
- /*
- * Validate fields which are required to be read-only by
- * user-space.
- */
- if (!user_read_access_begin(rseq, t->rseq_len))
- goto efault;
- unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
- unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_get_user(node_id, &rseq->node_id, efault_end);
- unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
- user_read_access_end();
-
- if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
- cpu_id != rseq_kernel_fields(t)->cpu_id ||
- node_id != rseq_kernel_fields(t)->node_id ||
- mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
- pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
- "\tcpu_id_start: %u ?= %u\n"
- "\tcpu_id: %u ?= %u\n"
- "\tnode_id: %u ?= %u\n"
- "\tmm_cid: %u ?= %u\n",
- t->pid, t->comm,
- cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
- cpu_id, rseq_kernel_fields(t)->cpu_id,
- node_id, rseq_kernel_fields(t)->node_id,
- mm_cid, rseq_kernel_fields(t)->mm_cid);
- }
-
- /* For now, only print a console warning on mismatch. */
- return 0;
-
-efault_end:
- user_read_access_end();
-efault:
- return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- do { \
- unsafe_put_user(value, &t->rseq->field, error_label); \
- rseq_kernel_fields(t)->field = value; \
- } while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- unsafe_put_user(value, &t->rseq->field, error_label)
-#endif
-
/*
- *
* Restartable sequences are a lightweight interface that allows
* user-level code to be executed atomically relative to scheduler
* preemption and signal delivery. Typically used for implementing
@@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t)
* F1. <failure>
*/
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
- struct rseq __user *rseq = t->rseq;
- u32 cpu_id = raw_smp_processor_id();
- u32 node_id = cpu_to_node(cpu_id);
- u32 mm_cid = task_mm_cid(t);
+/* Required to select the proper per_cpu ops for rseq_stat_inc() */
+#define RSEQ_BUILD_SLOW_PATH
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
- WARN_ON_ONCE((int) mm_cid < 0);
- if (!user_write_access_begin(rseq, t->rseq_len))
- goto efault;
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
- rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally updated only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- trace_rseq_update(t);
- return 0;
+DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+static inline void rseq_control_debug(bool on)
+{
+ if (on)
+ static_branch_enable(&rseq_debug_enabled);
+ else
+ static_branch_disable(&rseq_debug_enabled);
}
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static int __init rseq_setup_debug(char *str)
{
- struct rseq __user *rseq = t->rseq;
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
- mm_cid = 0;
-
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
+ bool on;
- if (!user_write_access_begin(rseq, t->rseq_len))
- goto efault;
-
- /*
- * Reset all fields to their initial state.
- *
- * All fields have an initial state of 0 except cpu_id which is set to
- * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
- * unregistration can figure out that rseq needs to be registered
- * again.
- */
- rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally reset only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+ if (kstrtobool(str, &on))
+ return -EINVAL;
+ rseq_control_debug(on);
+ return 1;
}
+__setup("rseq_debug=", rseq_setup_debug);
+#ifdef CONFIG_TRACEPOINTS
/*
- * Get the user-space pointer value stored in the 'rseq_cs' field.
+ * Out of line, so the actual update functions can be in a header to be
+ * inlined into the exit to user code.
*/
-static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
+void __rseq_trace_update(struct task_struct *t)
{
- if (!rseq_cs)
- return -EFAULT;
-
-#ifdef CONFIG_64BIT
- if (get_user(*rseq_cs, &rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-#endif
+ trace_rseq_update(t);
+}
- return 0;
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
+#endif /* CONFIG_TRACEPOINTS */
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_stats_show(struct seq_file *m, void *p)
{
- struct rseq_cs __user *urseq_cs;
- u64 ptr;
- u32 __user *usig;
- u32 sig;
- int ret;
-
- ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
- if (ret)
- return ret;
-
- /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
- if (!ptr) {
- memset(rseq_cs, 0, sizeof(*rseq_cs));
- return 0;
+ struct rseq_stats stats = { };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
+ stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
+ stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
+ stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
+ stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
+ stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
+ stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
}
- /* Check that the pointer value fits in the user-space process space. */
- if (ptr >= TASK_SIZE)
- return -EINVAL;
- urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
- if (rseq_cs->start_ip >= TASK_SIZE ||
- rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
- rseq_cs->abort_ip >= TASK_SIZE ||
- rseq_cs->version > 0)
- return -EINVAL;
- /* Check for overflow. */
- if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
- return -EINVAL;
- /* Ensure that abort_ip is not in the critical section. */
- if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
- return -EINVAL;
+ seq_printf(m, "exit: %16lu\n", stats.exit);
+ seq_printf(m, "signal: %16lu\n", stats.signal);
+ seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "fastp: %16lu\n", stats.fastpath);
+ seq_printf(m, "ids: %16lu\n", stats.ids);
+ seq_printf(m, "cs: %16lu\n", stats.cs);
+ seq_printf(m, "clear: %16lu\n", stats.clear);
+ seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ return 0;
+}
- usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
- ret = get_user(sig, usig);
- if (ret)
- return ret;
+static int rseq_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_stats_show, inode->i_private);
+}
- if (current->rseq_sig != sig) {
- printk_ratelimited(KERN_WARNING
- "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq_sig, current->pid, usig);
- return -EINVAL;
- }
+static const struct file_operations stat_ops = {
+ .open = rseq_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_stats_init(struct dentry *root_dir)
+{
+ debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
return 0;
}
+#else
+static inline void rseq_stats_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_STATS */
-static bool rseq_warn_flags(const char *str, u32 flags)
+static int rseq_debug_show(struct seq_file *m, void *p)
{
- u32 test_flags;
-
- if (!flags)
- return false;
- test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
- test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
- return true;
+ bool on = static_branch_unlikely(&rseq_debug_enabled);
+
+ seq_printf(m, "%d\n", on);
+ return 0;
}
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
{
- u32 flags, event_mask;
- int ret;
+ bool on;
- if (rseq_warn_flags("rseq_cs", cs_flags))
+ if (kstrtobool_from_user(ubuf, count, &on))
return -EINVAL;
- /* Get thread flags. */
- ret = get_user(flags, &t->rseq->flags);
- if (ret)
- return ret;
+ rseq_control_debug(on);
+ return count;
+}
- if (rseq_warn_flags("rseq", flags))
- return -EINVAL;
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_debug_show, inode->i_private);
+}
- /*
- * Load and clear event mask atomically with respect to
- * scheduler preemption and membarrier IPIs.
- */
- scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- }
+static const struct file_operations debug_ops = {
+ .open = rseq_debug_open,
+ .read = seq_read,
+ .write = rseq_debug_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+ struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
- return !!event_mask;
+ debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
+ rseq_stats_init(root_dir);
+ return 0;
}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
-static int clear_rseq_cs(struct rseq __user *rseq)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
- /*
- * The rseq_cs field is set to NULL on preemption or signal
- * delivery on top of rseq assembly block, as well as on top
- * of code outside of the rseq assembly block. This performs
- * a lazy clear of the rseq_cs field.
- *
- * Set rseq_cs to NULL.
- */
-#ifdef CONFIG_64BIT
- return put_user(0UL, &rseq->rseq_cs);
-#else
- if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
- return -EFAULT;
- return 0;
-#endif
+ return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}
-/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
- return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+ struct rseq __user *urseq = t->rseq.usrptr;
+ u64 csaddr;
+
+ scoped_user_read_access(urseq, efault)
+ unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
+ if (likely(!csaddr))
+ return true;
+ return rseq_update_user_cs(t, regs, csaddr);
+efault:
+ return false;
}
-static int rseq_ip_fixup(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
+ /*
+ * Preserve rseq state and user_irq state. The generic entry code
+ * clears user_irq on the way out; the non-generic entry
+ * architectures do not have user_irq.
+ */
+ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
struct task_struct *t = current;
- struct rseq_cs rseq_cs;
- int ret;
+ struct rseq_ids ids;
+ u32 node_id;
+ bool event;
+
+ if (unlikely(t->flags & PF_EXITING))
+ return;
- ret = rseq_get_rseq_cs(t, &rseq_cs);
- if (ret)
- return ret;
+ rseq_stat_inc(rseq_stats.slowpath);
/*
- * Handle potentially not being within a critical section.
- * If not nested over a rseq critical section, restart is useless.
- * Clear the rseq_cs pointer and return.
+ * Read and clear the event pending bit first. If the task
+ * was not preempted or migrated or a signal is on the way,
+ * there is no point in doing any of the heavy lifting here
+ * on production kernels. In that case TIF_NOTIFY_RESUME
+ * was raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
*/
- if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t->rseq);
- ret = rseq_need_restart(t, rseq_cs.flags);
- if (ret <= 0)
- return ret;
- ret = clear_rseq_cs(t->rseq);
- if (ret)
- return ret;
- trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
- rseq_cs.abort_ip);
- instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
- return 0;
+ scoped_guard(irq) {
+ event = t->rseq.event.sched_switch;
+ t->rseq.event.all &= evt_mask.all;
+ ids.cpu_id = task_cpu(t);
+ ids.mm_cid = task_mm_cid(t);
+ }
+
+ if (!event)
+ return;
+
+ node_id = cpu_to_node(ids.cpu_id);
+
+ if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ /*
+ * Clear the errors just in case this might survive magically, but
+ * leave the rest intact.
+ */
+ t->rseq.event.error = 0;
+ force_sig(SIGSEGV);
+ }
}
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
+void __rseq_handle_slowpath(struct pt_regs *regs)
{
- struct task_struct *t = current;
- int ret, sig;
-
- if (unlikely(t->flags & PF_EXITING))
+ /*
+ * If invoked from hypervisors before entering the guest via
+ * resume_user_mode_work(), then @regs is a NULL pointer.
+ *
+ * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+ * it before returning from the ioctl() to user space when
+ * rseq_event.sched_switch is set.
+ *
+ * So it's safe to ignore here instead of pointlessly updating it
+ * in the vcpu_run() loop.
+ */
+ if (!regs)
return;
+ rseq_slowpath_update_usr(regs);
+}
+
+void __rseq_signal_deliver(int sig, struct pt_regs *regs)
+{
+ rseq_stat_inc(rseq_stats.signal);
/*
- * regs is NULL if and only if the caller is in a syscall path. Skip
- * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
- * kill a misbehaving userspace on debug kernels.
+ * Don't update IDs, they are handled on exit to user if
+ * necessary. The important thing is to abort a critical section of
+ * the interrupted context as after this point the instruction
+ * pointer in @regs points to the signal handler.
*/
- if (regs) {
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+ if (unlikely(!rseq_handle_cs(current, regs))) {
+ /*
+ * Clear the errors just in case this might survive
+ * magically, but leave the rest intact.
+ */
+ current->rseq.event.error = 0;
+ force_sigsegv(sig);
}
- if (unlikely(rseq_update_cpu_node_id(t)))
- goto error;
- return;
-
-error:
- sig = ksig ? ksig->sig : 0;
- force_sigsegv(sig);
}
-#ifdef CONFIG_DEBUG_RSEQ
-
/*
* Terminate the process if a syscall is issued within a restartable
* sequence.
*/
-void rseq_syscall(struct pt_regs *regs)
+void __rseq_debug_syscall_return(struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
- struct rseq_cs rseq_cs;
+ u64 csaddr;
- if (!t->rseq)
+ if (!t->rseq.event.has_rseq)
return;
- if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
- force_sig(SIGSEGV);
+ if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
+ goto fail;
+ if (likely(!csaddr))
+ return;
+ if (unlikely(csaddr >= TASK_SIZE))
+ goto fail;
+ if (rseq_debug_update_user_cs(t, regs, csaddr))
+ return;
+fail:
+ force_sig(SIGSEGV);
}
+#ifdef CONFIG_DEBUG_RSEQ
+/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
+void rseq_syscall(struct pt_regs *regs)
+{
+ __rseq_debug_syscall_return(regs);
+}
#endif
+static bool rseq_reset_ids(void)
+{
+ struct rseq_ids ids = {
+ .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+ .mm_cid = 0,
+ };
+
+ /*
+ * If this fails, terminate the task, because the kernel would be
+ * left in an inconsistent state: exit to user space would try to
+ * fix up the IDs again.
+ */
+ if (rseq_set_ids(current, &ids, 0))
+ return true;
+
+ force_sig(SIGSEGV);
+ return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
/*
* sys_rseq - setup restartable sequences for caller thread.
*/
-SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
- int, flags, u32, sig)
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
- int ret;
- u64 rseq_cs;
-
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
/* Unregister rseq for current thread. */
- if (current->rseq != rseq || !current->rseq)
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
return -EINVAL;
- if (rseq_len != current->rseq_len)
+ if (rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_node_id(current);
- if (ret)
- return ret;
- current->rseq = NULL;
- current->rseq_sig = 0;
- current->rseq_len = 0;
+ if (!rseq_reset_ids())
+ return -EFAULT;
+ rseq_reset(current);
return 0;
}
if (unlikely(flags))
return -EINVAL;
- if (current->rseq) {
+ if (current->rseq.usrptr) {
/*
* If rseq is already registered, check whether
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != current->rseq_len)
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
/* Already registered. */
return -EBUSY;
@@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!access_ok(rseq, rseq_len))
return -EFAULT;
- /*
- * If the rseq_cs pointer is non-NULL on registration, clear it to
- * avoid a potential segfault on return to user-space. The proper thing
- * to do would have been to fail the registration but this would break
- * older libcs that reuse the rseq area for new threads without
- * clearing the fields.
- */
- if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
- return -EFAULT;
- if (rseq_cs && clear_rseq_cs(rseq))
- return -EFAULT;
+ scoped_user_write_access(rseq, efault) {
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields. Don't bother reading it, just reset it.
+ */
+ unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ /* Initialize IDs in user space */
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0U, &rseq->node_id, efault);
+ unsafe_put_user(0U, &rseq->mm_cid, efault);
+ }
-#ifdef CONFIG_DEBUG_RSEQ
- /*
- * Initialize the in-kernel rseq fields copy for validation of
- * read-only fields.
- */
- if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
- get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
- get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
- get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
- return -EFAULT;
-#endif
/*
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
*/
- current->rseq = rseq;
- current->rseq_len = rseq_len;
- current->rseq_sig = sig;
+ current->rseq.usrptr = rseq;
+ current->rseq.len = rseq_len;
+ current->rseq.sig = sig;
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
* are updated before returning to user-space.
*/
- rseq_set_notify_resume(current);
-
+ current->rseq.event.has_rseq = true;
+ rseq_force_update();
return 0;
+
+efault:
+ return -EFAULT;
}
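
The statistics summed in rseq_stats_show() above come from per-CPU counters that the entry paths bump via rseq_stat_inc(). Both are presumably defined in <linux/rseq_entry.h> (included above) and are not part of this excerpt; a sketch inferred from the usage here, with the raw_cpu_inc() choice and the config guard being assumptions:

#ifdef CONFIG_RSEQ_STATS
struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

#define rseq_stat_inc(which)	raw_cpu_inc(which)
#else
#define rseq_stat_inc(which)	do { } while (0)
#endif

In practice the counters show up as /sys/kernel/debug/rseq/stats, the debug static key is toggled through /sys/kernel/debug/rseq/debug, and rseq_debug= on the kernel command line sets the same key at boot, as wired up by rseq_debugfs_init() and rseq_setup_debug() above.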
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 198d2dd45f59..fc358c1b6ca9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2128,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
- if (flags & ENQUEUE_MIGRATED)
- sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -2169,37 +2170,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
-{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -2362,7 +2332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2377,10 +2347,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
}
void ___migrate_enable(void)
@@ -2613,7 +2581,8 @@ static int migration_cpu_stop(void *data)
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2622,7 +2591,8 @@ static int migration_cpu_stop(void *data)
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -2671,6 +2641,8 @@ out_unlock:
return 0;
}
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2684,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
/*
* Swap in a new user_cpus_ptr if SCA_USER flag set
@@ -2693,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
-
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->sched_class->set_cpus_allowed(p, ctx);
}
/*
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3064,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
unsigned int dest_cpu;
int ret = 0;
- update_rq_clock(rq);
-
if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
@@ -3120,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3329,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- rseq_migrate(p);
- sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -3529,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -3777,7 +3702,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4231,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
@@ -4370,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -4487,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
- init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4763,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
p->sched_task_group = tg;
}
#endif
- rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -4827,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
- rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -5121,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5284,19 +5205,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
- *
- * switch_mm_cid() needs to be updated if the barriers provided
- * by context_switch() are modified.
*/
- if (!next->mm) { // to kernel
+ if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
- if (prev->mm) // from user
+ if (prev->mm) // from user
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
- } else { // to user
+ } else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
@@ -5309,15 +5227,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
- if (!prev->mm) { // from kernel
+ if (!prev->mm) { // from kernel
/* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- /* switch_mm_cid() requires the memory barriers above. */
- switch_mm_cid(rq, prev, next);
+ mm_cid_switch_to(prev, next);
+
+ /*
+ * Tell rseq that the task was scheduled in. Must be after
+ * mm_cid_switch_to() to get the TIF flag set.
+ */
+ rseq_sched_switch_event(next);
prepare_lock_switch(rq, next, rf);
@@ -5602,7 +5525,6 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5692,7 +5614,7 @@ static void sched_tick_remote(struct work_struct *work)
* reasonable amount of time.
*/
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30);
}
curr->sched_class->task_tick(rq, curr, 0);
@@ -5916,19 +5838,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5972,7 +5881,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- p = pick_task_idle(rq);
+ p = pick_task_idle(rq, rf);
put_prev_set_next_task(rq, prev, p);
}
@@ -5984,11 +5893,15 @@ restart:
for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p)
return p;
} else {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
@@ -6018,7 +5931,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-static inline struct task_struct *pick_task(struct rq *rq)
+/*
+ * Careful: this can return RETRY_TASK; it does not include the retry loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -6026,7 +5943,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
for_each_active_class(class) {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p)
return p;
}
@@ -6041,7 +5958,7 @@ static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *p, *max = NULL;
+ struct task_struct *next, *p, *max;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
@@ -6126,7 +6043,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- next = pick_task(rq);
+restart_single:
+ next = pick_task(rq, rf);
+ if (unlikely(next == RETRY_TASK))
+ goto restart_single;
if (!next->core_cookie) {
rq->core_pick = NULL;
rq->core_dl_server = NULL;
@@ -6146,6 +6066,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* Tie-break prio towards the current CPU
*/
+restart_multi:
+ max = NULL;
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
@@ -6157,7 +6079,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- rq_i->core_pick = p = pick_task(rq_i);
+ p = pick_task(rq_i, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart_multi;
+
+ rq_i->core_pick = p;
rq_i->core_dl_server = rq_i->dl_server;
if (!max || prio_less(max, p, fi_before))
@@ -6179,7 +6105,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
- p = idle_sched_class.pick_task(rq_i);
+ p = idle_sched_class.pick_task(rq_i, rf);
}
rq_i->core_pick = p;
@@ -6812,6 +6738,7 @@ static void __sched notrace __schedule(int sched_mode)
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6918,7 +6845,6 @@ keep_resched:
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
@@ -7326,7 +7252,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7388,64 +7314,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting conditions are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
-
- p->sched_class = next_class;
- p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
+ p->sched_class = next_class;
+ p->prio = prio;
+ }
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
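
From here on, rt_mutex_setprio() above and sched_setnuma()/sched_move_task() below express the old "dequeue + put_prev ... enqueue + set_next" sequence through the sched_change scope guard, which this series introduces outside this excerpt. Judging by the open-coded sequences it replaces and the scope->flags / scope->running uses in this file, it behaves roughly as sketched below; p, new_prio and anything about 'scope' beyond flags/running are illustrative:

	scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
		/*
		 * On entry, with the rq lock held, the guard has already
		 * dequeued p (if it was queued) and called put_prev_task()
		 * (if it was running), recording both facts in 'scope'.
		 */
		p->prio = new_prio;		/* hypothetical change */
		scope->flags |= ENQUEUE_HEAD;	/* optionally adjust requeue flags */
	}
	/*
	 * On scope exit p is re-enqueued with scope->flags and, if it was
	 * running (scope->running), made the next task again; with
	 * DEQUEUE_CLASS set, the class switch notifiers that used to go
	 * through check_class_changing()/check_class_changed() are
	 * presumably handled there as well.
	 */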
@@ -8084,26 +7997,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -8141,18 +8037,15 @@ static int __balance_push_cpu_stop(void *arg)
struct rq_flags rf;
int cpu;
- raw_spin_lock_irq(&p->pi_lock);
- rq_lock(rq, &rf);
-
- update_rq_clock(rq);
-
- if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ scoped_guard (raw_spinlock_irq, &p->pi_lock) {
cpu = select_fallback_rq(rq->cpu, p);
- rq = __migrate_task(rq, &rf, p, cpu);
- }
- rq_unlock(rq, &rf);
- raw_spin_unlock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, cpu);
+ rq_unlock(rq, &rf);
+ }
put_task_struct(p);
@@ -8571,10 +8464,12 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
WARN(true, "Dying CPU not properly vacated!");
dump_rq_tasks(rq, KERN_WARNING);
}
+ dl_server_stop(&rq->fair_server);
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -8589,6 +8484,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
@@ -9205,38 +9102,23 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -9604,7 +9486,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg,
guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
@@ -10372,557 +10254,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-
-/*
- * @cid_lock: Guarantee forward-progress of cid allocation.
- *
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
-/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
- *
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
+ * Concurrency IDentifier management
*
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
- *
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
+ * Serialization rules:
*
- * X = Y = 0
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
*
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
*
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
*
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
+ * The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue
+ * lock.
*
- * (TSA) Store to rq->curr with transition from (N) to (Y)
+ * CID ownership:
*
- * (TSB) Store to rq->curr with transition from (Y) to (N)
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out. Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
*
- * On the remote-clear side, there is one transition we care about:
+ * Mode switching:
*
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
*
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, num_possible_cpus());
*
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ * The +25% allowance avoids frequent mode switches with tight CPU masks in
+ * scenarios where only a few threads are created and destroyed. The
+ * allowance shrinks the closer opt_cids gets to num_possible_cpus(), which
+ * is the (unfortunate) hard ABI limit.
*
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either transfers each task's owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
*
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
*
- * Scenario A) (TSA)+(TMA) (from next task perspective)
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
*
- * CPU0 CPU1
+ * This threshold is updated when an affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
*
- * Context switch CS-1 Remote-clear
- * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
- * (implied barrier after cmpxchg)
- * - switch_mm_cid()
- * - memory barrier (see switch_mm_cid()
- * comment explaining how this barrier
- * is combined with other scheduler
- * barriers)
- * - mm_cid_get (next)
- * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ * If the switch back was initiated by an exiting task, then that task runs
+ * the fixup function. If it was initiated by an affinity change, then it's
+ * run either in the deferred update function in the context of a workqueue,
+ * or by a task which forks a new one, or by a task which exits, whichever
+ * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible
+ * CPUs and either transfers each CPU owned CID to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themselves.
*
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
+ * This transition from CPU to per task ownership happens in two phases:
*
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
+ * 1) mm::mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed onto the task
+ * CID and denotes that the CID is only temporarily owned by the
+ * task. When it schedules out the task drops the CID back into the
+ * pool if this bit is set.
*
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
+ * 2) The initiating context walks the per CPU space and after completion
+ * clears mm::mm_cid.transit. So after that point the CIDs are strictly
+ * task owned again.
*
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
*
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
*/
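To make the sizing rules documented above concrete, here is a small userspace model of the opt_cids/max_cids/pcpu_thrs arithmetic and the mode-switch hysteresis it produces. This is illustrative only, not kernel code: nr_possible_cpus stands in for num_possible_cpus() and the helper names are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))
#define MAX(a, b)	((a) > (b) ? (a) : (b))

static const unsigned int nr_possible_cpus = 8;	/* stand-in for num_possible_cpus() */

static unsigned int calc_max_cids(unsigned int nr_cpus_allowed, unsigned int users)
{
	unsigned int opt_cids = MIN(nr_cpus_allowed, users);

	/* +25% allowance, capped at the number of possible CPUs */
	return MIN(opt_cids + opt_cids / 4, nr_possible_cpus);
}

static unsigned int calc_pcpu_thrs(unsigned int nr_cpus_allowed, unsigned int users)
{
	unsigned int opt_cids = MIN(nr_cpus_allowed, users);

	/* At least 1 because 0 means "per CPU mode off" */
	return MAX(MIN(opt_cids - opt_cids / 4, nr_possible_cpus / 2), 1u);
}

int main(void)
{
	unsigned int users, pcpu_thrs = 0;
	bool percpu = false;

	/* Ramp the user count up, then down, and watch the mode flip with hysteresis */
	for (users = 1; users <= 12; users++) {
		if (!percpu && users > calc_max_cids(nr_possible_cpus, users)) {
			percpu = true;
			pcpu_thrs = calc_pcpu_thrs(nr_possible_cpus, users);
		}
		printf("up   users=%2u max_cids=%u mode=%s\n", users,
		       calc_max_cids(nr_possible_cpus, users),
		       percpu ? "per-CPU" : "per-task");
	}
	for (users = 12; users >= 1; users--) {
		if (percpu && users < pcpu_thrs)
			percpu = false;
		printf("down users=%2u pcpu_thrs=%u mode=%s\n", users,
		       pcpu_thrs, percpu ? "per-CPU" : "per-task");
	}
	return 0;
}

With 8 possible CPUs the model switches to per-CPU mode at 9 users and back to per-task mode once the count drops below 4, matching the documented +25% growth allowance and the lower switch-back threshold.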
-void sched_mm_cid_migrate_from(struct task_struct *t)
-{
- t->migrate_from_cpu = task_cpu(t);
-}
-
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
+/*
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes.
+ */
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
{
- struct mm_struct *mm = t->mm;
- struct task_struct *src_task;
- int src_cid, last_mm_cid;
+ unsigned int opt_cids, max_cids;
- if (!mm)
- return -1;
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
- last_mm_cid = t->last_mm_cid;
- /*
- * If the migrated task has no last cid, or if the current
- * task on src rq uses the cid, it means the source cid does not need
- * to be moved to the destination cpu.
- */
- if (last_mm_cid == -1)
- return -1;
- src_cid = READ_ONCE(src_pcpu_cid->cid);
- if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
- return -1;
+ /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+ max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
- /*
- * If we observe an active task using the mm on this rq, it means we
- * are not the last task to be migrated from this cpu for this mm, so
- * there is no need to move src_cid to the destination cpu.
- */
- guard(rcu)();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- t->last_mm_cid = -1;
- return -1;
- }
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
- return src_cid;
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* Has to be at least 1 because 0 indicates PCPU mode off */
+ return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
}
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
+static bool mm_update_max_cids(struct mm_struct *mm)
{
- struct task_struct *src_task;
- struct mm_struct *mm = t->mm;
- int lazy_cid;
-
- if (src_cid == -1)
- return -1;
+ struct mm_mm_cid *mc = &mm->mm_cid;
- /*
- * Attempt to clear the source cpu cid to move it to the destination
- * cpu.
- */
- lazy_cid = mm_cid_set_lazy_put(src_cid);
- if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
- return -1;
+ lockdep_assert_held(&mm->mm_cid.lock);
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, this task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
- }
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
}
- /*
- * The src_cid is unused, so it can be unset.
- */
- if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- return -1;
- WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
- return src_cid;
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ /* When switching back to per TASK mode, set the transition flag */
+ if (!mc->pcpu_thrs)
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
}
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
- struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
- struct mm_struct *mm = t->mm;
- int src_cid, src_cpu;
- bool dst_cid_is_set;
- struct rq *src_rq;
-
- lockdep_assert_rq_held(dst_rq);
+ struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
+ unsigned int weight;
- if (!mm)
+ if (!mm || !READ_ONCE(mm->mm_cid.users))
return;
- src_cpu = t->migrate_from_cpu;
- if (src_cpu == -1) {
- t->last_mm_cid = -1;
- return;
- }
/*
- * Move the src cid if the dst cid is unset. This keeps id
- * allocation closest to 0 in cases where few threads migrate around
- * many CPUs.
- *
- * If destination cid or recent cid is already set, we may have
- * to just clear the src cid to ensure compactness in frequent
- * migrations scenarios.
- *
- * It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed CPUs, because user-space
- * can expect that the number of allowed cids can reach the number of
- * allowed CPUs.
- */
- dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
- !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
- if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
+ * mm::mm_cid::mm_cpus_allowed is the superset of each thread's
+ * allowed CPUs mask, which means it can only grow.
+ */
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
+ mm_allowed = mm_cpus_allowed(mm);
+ weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+ if (weight == mc->nr_cpus_allowed)
return;
- src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
- src_rq = cpu_rq(src_cpu);
- src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
- if (src_cid == -1)
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
return;
- src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
- src_cid);
- if (src_cid == -1)
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
return;
- if (dst_cid_is_set) {
- __mm_cid_put(mm, src_cid);
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
return;
- }
- /* Move src_cid to dst cpu. */
- mm_cid_snapshot_time(dst_rq, mm);
- WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
- WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
+
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
}
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
- struct rq *rq = cpu_rq(cpu);
- struct task_struct *t;
- int cid, lazy_cid;
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid))
- return;
+ t->mm_cid.cid = cid_to_transit_cid(cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
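For reference, the following is a self-contained toy of the CID flag handling described in the ownership comment above. The MM_CID_ONCPU/MM_CID_TRANSIT bit positions and the helper bodies are assumptions made only for this example; the real definitions live elsewhere in the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MM_CID_ONCPU	(1U << 31)	/* assumed: CID owned by a CPU */
#define MM_CID_TRANSIT	(1U << 30)	/* assumed: temporary task ownership */
#define MM_CID_MASK	(MM_CID_TRANSIT - 1)

static bool cid_on_cpu(unsigned int cid)		{ return cid & MM_CID_ONCPU; }
static bool cid_in_transit(unsigned int cid)		{ return cid & MM_CID_TRANSIT; }

static unsigned int cid_to_cpu_cid(unsigned int cid)	{ return cid | MM_CID_ONCPU; }
static unsigned int cpu_cid_to_cid(unsigned int cid)	{ return cid & ~MM_CID_ONCPU; }
static unsigned int cid_to_transit_cid(unsigned int cid)  { return cid | MM_CID_TRANSIT; }
static unsigned int cid_from_transit_cid(unsigned int cid){ return cid & ~MM_CID_TRANSIT; }

/* What would be handed to user space via rseq: both flag bits stripped */
static unsigned int task_cid(unsigned int cid)		{ return cid & MM_CID_MASK; }

int main(void)
{
	unsigned int cid = 3;

	/* CPU takes ownership, then hands the CID back in transit mode */
	unsigned int on_cpu = cid_to_cpu_cid(cid);
	unsigned int transit = cid_to_transit_cid(cpu_cid_to_cid(on_cpu));

	assert(cid_on_cpu(on_cpu) && !cid_on_cpu(transit));
	assert(cid_in_transit(transit));
	assert(cid_from_transit_cid(transit) == 3);
	assert(task_cid(on_cpu) == 3 && task_cid(transit) == 3);
	printf("raw=%u on_cpu=%#x transit=%#x user-visible=%u\n",
	       cid, on_cpu, transit, task_cid(transit));
	return 0;
}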
- /*
- * Clear the cpu cid if it is set to keep cid allocation compact. If
- * there happens to be other tasks left on the source cpu using this
- * mm, the next task using this mm will reallocate its cid on context
- * switch.
- */
- lazy_cid = mm_cid_set_lazy_put(cid);
- if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
- return;
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, that task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
- return;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ /* Is the CID still owned by the CPU? */
+ if (cid_on_cpu(pcp->cid)) {
+ /*
+ * If rq->curr has @mm, transfer it with the
+ * transition bit set. Otherwise drop it.
+ */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transit_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+
+ } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+ unsigned int cid = rq->curr->mm_cid.cid;
+
+ /* Ensure it has the transition bit set */
+ if (!cid_in_transit(cid)) {
+ cid = cid_to_transit_cid(cid);
+ rq->curr->mm_cid.cid = cid;
+ pcp->cid = cid;
+ }
+ }
}
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
- /*
- * The cid is unused, so it can be unset.
- * Disable interrupts to keep the window of cid ownership without rq
- * lock small.
- */
- scoped_guard (irqsave) {
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
}
}
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
- struct rq *rq = cpu_rq(cpu);
- struct mm_cid *pcpu_cid;
- struct task_struct *curr;
- u64 rq_clock;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ /* If the task is not active it is not in the users count */
+ if (!t->mm_cid.active)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
- /*
- * rq->clock load is racy on 32-bit but one spurious clear once in a
- * while is irrelevant.
- */
- rq_clock = READ_ONCE(rq->clock);
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
/*
- * In order to take care of infrequently scheduled tasks, bump the time
- * snapshot associated with this cid if an active task using the mm is
- * observed on this rq.
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred its own CID. The newly incoming
+ * task is already accounted for, but not yet visible.
*/
- scoped_guard (rcu) {
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- return;
- }
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ if (mm_cid_fixup_task_to_cpu(t, mm))
+ users--;
}
- if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ if (!users)
return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+
+ /* Happens only for CLONE_VM processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
}
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid;
- int cid;
-
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid) || cid < weight)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
}
-static void task_mm_cid_work(struct callback_head *work)
+void sched_mm_cid_fork(struct task_struct *t)
{
- unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
- struct cpumask *cidmask;
- struct mm_struct *mm;
- int weight, cpu;
+ struct mm_struct *mm = t->mm;
+ bool percpu;
- WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
- old_scan = READ_ONCE(mm->mm_cid_next_scan);
- next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- if (!old_scan) {
- unsigned long res;
-
- res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
- if (res != old_scan)
- old_scan = res;
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transit_to_task(current, pcp);
else
- old_scan = next_scan;
+ mm_cid_transfer_to_cpu(current, pcp);
}
- if (time_before(now, old_scan))
- return;
- if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
- cidmask = mm_cidmask(mm);
- /* Clear cids that were not recently used. */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_old(mm, cpu);
- weight = cpumask_weight(cidmask);
- /*
- * Clear cids that are greater or equal to the cidmask weight to
- * recompact it.
- */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- int mm_users = 0;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1)
- mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
}
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+static bool sched_mm_cid_remove_user(struct task_struct *t)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ t->mm_cid.active = 0;
+ scoped_guard(preempt) {
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ }
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
}
-void sched_mm_cid_exit_signals(struct task_struct *t)
+static bool __sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
- return;
-
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
+ if (!sched_mm_cid_remove_user(t))
+ return false;
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * In contrast to fork(), this only deals with a switch back to per
+ * task mode, either because the above decreased the user count or
+ * because an affinity change increased the number of allowed CPUs
+ * and the deferred fixup did not run yet.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
}
-void sched_mm_cid_before_execve(struct task_struct *t)
+/*
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
+ */
+void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
+ if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer current's CID */
+ mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize any pending work to ensure that there are no
+ * dangling references left. mm::mm_cid::users is zero so nothing
+ * can queue this work anymore.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
+}
+
+/* Deactivate MM CID allocation across execve() */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ sched_mm_cid_exit(t);
}
+/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
- struct rq *rq;
+ sched_mm_cid_fork(t);
+}
- if (!mm)
+static void mm_cid_work_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
+
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
return;
- preempt_disable();
- rq = this_rq();
- scoped_guard (rq_lock_irqsave, rq) {
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Have fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
}
+ mm_cid_fixup_cpus_to_tasks(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void mm_cid_irq_work(struct irq_work *work)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
- t->mm_cid_active = 1;
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
}
-#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+ mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), num_possible_cpus());
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
+
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
- .running = task_current(rq, p),
+ .running = task_current_donor(rq, p),
};
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
+
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
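The begin/end pair above appears to be what the scoped_guard (sched_change, ...) callers such as sched_move_task() wrap; the guard class definition is not part of this hunk, so that pairing is an assumption here. The protocol itself - record whether the task was queued/running and its priority, take it off the runqueue, apply the change, then restore it symmetrically and notify on a priority change - can be sketched with a userspace toy using simplified stand-in types instead of the kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct toy_task { bool queued, running; int prio; };

struct toy_change_ctx { struct toy_task *p; bool queued, running; int prio; };

static struct toy_change_ctx toy_change_begin(struct toy_task *p)
{
	struct toy_change_ctx ctx = {
		.p = p, .queued = p->queued, .running = p->running, .prio = p->prio,
	};

	/* Take the task out of the "runqueue" before its properties change */
	if (ctx.queued)
		p->queued = false;
	if (ctx.running)
		p->running = false;
	return ctx;
}

static void toy_change_end(struct toy_change_ctx *ctx)
{
	struct toy_task *p = ctx->p;

	/* Put it back exactly as it was found */
	if (ctx->queued)
		p->queued = true;
	if (ctx->running)
		p->running = true;
	/* Equivalent of the prio_changed() notification on the end side */
	if (p->prio != ctx->prio)
		printf("prio changed: %d -> %d\n", ctx->prio, p->prio);
}

int main(void)
{
	struct toy_task t = { .queued = true, .running = true, .prio = 120 };
	struct toy_change_ctx ctx = toy_change_begin(&t);

	t.prio = 100;		/* the "change" performed inside the scope */
	toy_change_end(&ctx);
	return 0;
}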
-#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index cdd740b3f774..37b572cc8aca 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
* cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target CPU
+ * @online: the online state of the deadline runqueue
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_clear(struct cpudl *cp, int cpu)
+void cpudl_clear(struct cpudl *cp, int cpu, bool online)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu)
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
+ * This could happen if rq_online_dl or rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
} else {
@@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu)
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
cpudl_heapify(cp, old_idx);
-
- cpumask_set_cpu(cpu, cp->free_cpus);
}
+ if (likely(online))
+ __cpumask_set_cpu(cpu, cp->free_cpus);
+ else
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
+
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
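In effect, free_cpus now encodes "online and not running a deadline task" in one place instead of being maintained through separate set/clear helpers. A tiny bitmask model of just that bookkeeping, with plain bit operations standing in for the cpumask API:

#include <stdio.h>

static unsigned long free_cpus;

/* Model of the new semantics: clearing the heap entry marks the CPU free
 * only while its runqueue is online; offline CPUs never appear free. */
static void toy_cpudl_clear(int cpu, int online)
{
	/* ...max-heap removal elided... */
	if (online)
		free_cpus |= 1UL << cpu;
	else
		free_cpus &= ~(1UL << cpu);
}

static void toy_cpudl_set(int cpu)
{
	/* A CPU with a queued deadline task is never free */
	free_cpus &= ~(1UL << cpu);
}

int main(void)
{
	toy_cpudl_clear(2, 1);	/* online, no DL task -> free        */
	toy_cpudl_set(2);	/* DL task queued     -> busy        */
	toy_cpudl_clear(2, 1);	/* DL task gone       -> free again  */
	toy_cpudl_clear(3, 0);	/* going offline      -> not free    */
	printf("free_cpus=%#lx\n", free_cpus);	/* 0x4: only CPU 2 */
	return 0;
}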
@@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
cpudl_heapify_up(cp, new_idx);
- cpumask_clear_cpu(cpu, cp->free_cpus);
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);
@@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
}
/*
- * cpudl_set_freecpu - Set the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_set_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_set_cpu(cpu, cp->free_cpus);
-}
-
-/*
- * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_clear_cpu(cpu, cp->free_cpus);
-}
-
-/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 11c0f1faa7e1..d7699468eedd 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -19,8 +19,6 @@ struct cpudl {
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
-void cpudl_clear(struct cpudl *cp, int cpu);
+void cpudl_clear(struct cpudl *cp, int cpu, bool online);
int cpudl_init(struct cpudl *cp);
-void cpudl_set_freecpu(struct cpudl *cp, int cpu);
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..4f97896887ec 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
- u64 utime, stime;
struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
+ u64 utime, stime;
/*
* Update current task runtime to account pending time since last
@@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
if (same_thread_group(current, tsk))
(void) task_sched_runtime(current);
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
- for_each_thread(tsk, t) {
+ __for_each_thread(sig, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += read_sum_exec_runtime(t);
}
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
+ }
}
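The removed loop open-coded the classic seqcount pattern: read optimistically and retry (taking the lock on the second pass) if a writer interfered; the new scoped_seqlock_read() guard packages the same idea. Below is a minimal, single-threaded userspace sketch of the optimistic read/retry protocol, with a plain integer standing in for the kernel's seqlock primitives:

#include <stdbool.h>
#include <stdio.h>

struct stats {
	unsigned int seq;			/* odd while an update is in flight */
	unsigned long long utime, stime;
};

static unsigned int read_begin(const struct stats *s)
{
	return s->seq;
}

static bool read_retry(const struct stats *s, unsigned int start)
{
	/* Retry if the snapshot started mid-update or a writer ran meanwhile */
	return (start & 1) || s->seq != start;
}

static void writer_update(struct stats *s, unsigned long long du, unsigned long long ds)
{
	s->seq++;		/* enter write section: seq becomes odd */
	s->utime += du;
	s->stime += ds;
	s->seq++;		/* leave: seq becomes even again */
}

int main(void)
{
	struct stats s = { .seq = 0, .utime = 100, .stime = 50 };
	unsigned long long utime, stime;
	unsigned int start;

	do {
		start = read_begin(&s);
		utime = s.utime;
		stime = s.stime;
		/* A concurrent writer_update() here would force another pass */
	} while (read_retry(&s, start));

	writer_update(&s, 10, 5);
	printf("snapshot utime=%llu stime=%llu, now %llu/%llu\n",
	       utime, stime, s.utime, s.stime);
	return 0;
}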
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 615411a0a881..67f540c23717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
- if (cpumask_subset(rd->span, cpu_active_mask))
- return cpumask_weight(rd->span);
-
- cpus = 0;
-
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
-
- return cpus;
+ return cpumask_weight_and(rd->span, cpu_active_mask);
}
static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
@@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
{
struct hrtimer *timer = &dl_se->inactive_timer;
struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
} else {
struct task_struct *p = dl_task_of(dl_se);
- if (dl_task(p))
+ if (dl_task)
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
sched_clock_tick();
update_rq_clock(rq);
- if (!dl_se->dl_runtime)
+ /*
+ * Make sure current has propagated its pending runtime into
+ * any relevant server through calling dl_server_update() and
+ * friends.
+ */
+ rq->donor->sched_class->update_curr(rq);
+
+ if (dl_se->dl_defer_idle) {
+ dl_server_stop(dl_se);
return HRTIMER_NORESTART;
+ }
if (dl_se->dl_defer_armed) {
/*
@@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta
}
static inline void
-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
- int flags);
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags);
+
static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
+ bool idle = rq->curr == rq->idle;
s64 scaled_delta_exec;
if (unlikely(delta_exec <= 0)) {
@@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
dl_se->runtime -= scaled_delta_exec;
+ if (dl_se->dl_defer_idle && !idle)
+ dl_se->dl_defer_idle = 0;
+
/*
* The fair server can consume its runtime while throttled (not queued/
* running as regular CFS).
@@ -1450,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
*/
if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
/*
+ * Non-servers would never get time accounted while throttled.
+ */
+ WARN_ON_ONCE(!dl_server(dl_se));
+
+ /*
+ * While the server is marked idle, do not push out the
+ * activation further; instead wait for the period timer
+ * to lapse and stop the server.
+ */
+ if (dl_se->dl_defer_idle && idle) {
+ /*
+ * The timer is at the zero-laxity point, which means
+ * dl_server_stop() / dl_server_start() can happen
+ * while now < deadline. This means update_dl_entity()
+ * will not replenish. Additionally start_dl_timer()
+ * will be set for 'deadline - runtime'. Negative
+ * runtime will not do.
+ */
+ dl_se->runtime = 0;
+ return;
+ }
+
+ /*
* If the server was previously activated - the starving condition
* took place - at this point it went away because the fair scheduler
* was able to get runtime in the background. So return to the initial
@@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
replenish_dl_new_period(dl_se, dl_se->rq);
+ if (idle)
+ dl_se->dl_defer_idle = 1;
+
/*
* Not being able to start the timer seems problematic. If it could not
* be started for whatever reason, we need to "unthrottle" the DL server
@@ -1543,38 +1573,213 @@ throttle:
* as time available for the fair server, avoiding a penalty for the
* rt scheduler that did not consume that time.
*/
-void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
- s64 delta_exec;
-
- if (!rq->fair_server.dl_defer)
- return;
-
- /* no need to discount more */
- if (rq->fair_server.runtime < 0)
- return;
-
- delta_exec = rq_clock_task(rq) - p->se.exec_start;
- if (delta_exec < 0)
- return;
-
- rq->fair_server.runtime -= delta_exec;
-
- if (rq->fair_server.runtime < 0) {
- rq->fair_server.dl_defer_running = 0;
- rq->fair_server.runtime = 0;
- }
-
- p->se.exec_start = rq_clock_task(rq);
+ if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_server_active && dl_se->dl_runtime)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
+/*
+ * dl_server && dl_defer:
+ *
+ * 6
+ * +--------------------+
+ * v |
+ * +-------------+ 4 +-----------+ 5 +------------------+
+ * +-> | A:init | <--- | D:running | -----> | E:replenish-wait |
+ * | +-------------+ +-----------+ +------------------+
+ * | | | 1 ^ ^ |
+ * | | 1 +----------+ | 3 |
+ * | v | |
+ * | +--------------------------------+ 2 |
+ * | | | ----+ |
+ * | 8 | B:zero_laxity-wait | | |
+ * | | | <---+ |
+ * | +--------------------------------+ |
+ * | | ^ ^ 2 |
+ * | | 7 | 2 +--------------------+
+ * | v |
+ * | +-------------+ |
+ * +-- | C:idle-wait | -+
+ * +-------------+
+ * ^ 7 |
+ * +---------+
+ *
+ *
+ * [A] - init
+ * dl_server_active = 0
+ * dl_throttled = 0
+ * dl_defer_armed = 0
+ * dl_defer_running = 0/1
+ * dl_defer_idle = 0
+ *
+ * [B] - zero_laxity-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 1
+ * dl_defer_running = 0
+ * dl_defer_idle = 0
+ *
+ * [C] - idle-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 1
+ * dl_defer_running = 0
+ * dl_defer_idle = 1
+ *
+ * [D] - running
+ * dl_server_active = 1
+ * dl_throttled = 0
+ * dl_defer_armed = 0
+ * dl_defer_running = 1
+ * dl_defer_idle = 0
+ *
+ * [E] - replenish-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 0
+ * dl_defer_running = 1
+ * dl_defer_idle = 0
+ *
+ *
+ * [1] A->B, A->D
+ * dl_server_start()
+ * dl_server_active = 1;
+ * enqueue_dl_entity()
+ * update_dl_entity(WAKEUP)
+ * if (!dl_defer_running)
+ * dl_defer_armed = 1;
+ * dl_throttled = 1;
+ * if (dl_throttled && start_dl_timer())
+ * return; // [B]
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // deplete server runtime from client-class
+ * [2] B->B, C->B, E->B
+ * dl_server_update()
+ * update_curr_dl_se() // idle = false
+ * if (dl_defer_idle)
+ * dl_defer_idle = 0;
+ * if (dl_defer && dl_throttled && dl_runtime_exceeded())
+ * dl_defer_running = 0;
+ * hrtimer_try_to_cancel(); // stop timer
+ * replenish_dl_new_period()
+ * // fwd period
+ * dl_throttled = 1;
+ * dl_defer_armed = 1;
+ * start_dl_timer(); // restart timer
+ * // [B]
+ *
+ * // timer actually fires means we have runtime
+ * [3] B->D
+ * dl_server_timer()
+ * if (dl_defer_armed)
+ * dl_defer_running = 1;
+ * enqueue_dl_entity(REPLENISH)
+ * replenish_dl_entity()
+ * // fwd period
+ * if (dl_throttled)
+ * dl_throttled = 0;
+ * if (dl_defer_armed)
+ * dl_defer_armed = 0;
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // schedule server
+ * [4] D->A
+ * pick_task_dl()
+ * p = server_pick_task();
+ * if (!p)
+ * dl_server_stop()
+ * dequeue_dl_entity();
+ * hrtimer_try_to_cancel();
+ * dl_defer_armed = 0;
+ * dl_throttled = 0;
+ * dl_server_active = 0;
+ * // [A]
+ * return p;
+ *
+ * // server running
+ * [5] D->E
+ * update_curr_dl_se()
+ * if (dl_runtime_exceeded())
+ * dl_throttled = 1;
+ * dequeue_dl_entity();
+ * start_dl_timer();
+ * // [E]
+ *
+ * // server replenished
+ * [6] E->D
+ * dl_server_timer()
+ * enqueue_dl_entity(REPLENISH)
+ * replenish_dl_entity()
+ * fwd-period
+ * if (dl_throttled)
+ * dl_throttled = 0;
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // deplete server runtime from idle
+ * [7] B->C, C->C
+ * dl_server_update_idle()
+ * update_curr_dl_se() // idle = true
+ * if (dl_defer && dl_throttled && dl_runtime_exceeded())
+ * if (dl_defer_idle)
+ * return;
+ * dl_defer_running = 0;
+ * hrtimer_try_to_cancel();
+ * replenish_dl_new_period()
+ * // fwd period
+ * dl_throttled = 1;
+ * dl_defer_armed = 1;
+ * dl_defer_idle = 1;
+ * start_dl_timer(); // restart timer
+ * // [C]
+ *
+ * // stop idle server
+ * [8] C->A
+ * dl_server_timer()
+ * if (dl_defer_idle)
+ * dl_server_stop();
+ * // [A]
+ *
+ *
+ * digraph dl_server {
+ * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
+ * "A:init" -> "D:running" [label="1:dl_server_start"]
+ * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
+ * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
+ * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
+ * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
+ * "D:running" -> "A:init" [label="4:pick_task_dl"]
+ * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"]
+ * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"]
+ * }
+ *
+ *
+ * Notes:
+ *
+ * - When there are fair tasks running, the most likely loop is [2]->[2];
+ * the dl_server never actually runs and the timer never fires.
+ *
+ * - When there is actual fair starvation, the timer fires and starts the
+ * dl_server. This will then throttle and replenish like a normal DL
+ * task. Notably it will not 'defer' again.
+ *
+ * - When idle, it will push the activation forward once and then wait
+ * for the timer to fire or for a non-idle update to restart things.
+ */
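The digraph above can also be read as a plain transition table. The sketch below encodes exactly those states and events (with one simplification: event 1 is modeled as A->B only, although the diagram also allows A->D when no deferral is needed) so a transition sequence can be replayed by hand; it does not model the flag bookkeeping itself.

#include <stdio.h>

enum state { S_INIT, S_ZL_WAIT, S_IDLE_WAIT, S_RUNNING, S_REPL_WAIT, S_NR };

static const char *state_name[S_NR] = {
	"A:init", "B:zero_laxity-wait", "C:idle-wait", "D:running", "E:replenish-wait",
};

/* next[state][event - 1]; -1 means the event is not expected in that state */
static const int next[S_NR][8] = {
	/* A */ { S_ZL_WAIT, -1, -1, -1, -1, -1, -1, -1 },
	/* B */ { -1, S_ZL_WAIT, S_RUNNING, -1, -1, -1, S_IDLE_WAIT, -1 },
	/* C */ { -1, S_ZL_WAIT, -1, -1, -1, -1, S_IDLE_WAIT, S_INIT },
	/* D */ { -1, -1, -1, S_INIT, S_REPL_WAIT, -1, -1, -1 },
	/* E */ { -1, S_ZL_WAIT, -1, -1, -1, S_RUNNING, -1, -1 },
};

int main(void)
{
	/* Typical sequence: start, keep deferring, go idle, finally run and stop */
	int events[] = { 1, 2, 2, 7, 2, 3, 5, 6, 4 };
	enum state s = S_INIT;

	for (unsigned int i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
		int n = next[s][events[i] - 1];

		printf("%-20s --%d--> %s\n", state_name[s], events[i],
		       n < 0 ? "(unexpected)" : state_name[n]);
		if (n >= 0)
			s = n;
	}
	return 0;
}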
void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
@@ -1582,6 +1787,14 @@ void dl_server_start(struct sched_dl_entity *dl_se)
if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
+ /*
+ * Update the current task to 'now'.
+ */
+ rq->donor->sched_class->update_curr(rq);
+
+ if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
+ return;
+
dl_se->dl_server_active = 1;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
@@ -1597,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
hrtimer_try_to_cancel(&dl_se->dl_timer);
dl_se->dl_defer_armed = 0;
dl_se->dl_throttled = 0;
+ dl_se->dl_defer_idle = 0;
dl_se->dl_server_active = 0;
}
@@ -1808,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
@@ -2045,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(dl_se);
+ task_non_contending(dl_se, true);
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2140,7 +2354,7 @@ static void yield_task_dl(struct rq *rq)
* it and the bandwidth timer will wake it up and will give it
* new scheduling parameters (thanks to dl_yielded=1).
*/
- rq->curr->dl.dl_yielded = 1;
+ rq->donor->dl.dl_yielded = 1;
update_rq_clock(rq);
update_curr_dl(rq);
@@ -2170,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
struct rq *rq;
if (!(flags & WF_TTWU))
- goto out;
+ return cpu;
rq = cpu_rq(cpu);
@@ -2208,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
}
rcu_read_unlock();
-out:
return cpu;
}
@@ -2352,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
* __pick_next_task_dl - Helper to pick the next -deadline task to run.
* @rq: The runqueue to pick the next task from.
*/
-static struct task_struct *__pick_task_dl(struct rq *rq)
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -2366,7 +2579,7 @@ again:
WARN_ON_ONCE(!dl_se);
if (dl_server(dl_se)) {
- p = dl_se->server_pick_task(dl_se);
+ p = dl_se->server_pick_task(dl_se, rf);
if (!p) {
dl_server_stop(dl_se);
goto again;
@@ -2379,9 +2592,9 @@ again:
return p;
}
-static struct task_struct *pick_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
- return __pick_task_dl(rq);
+ return __pick_task_dl(rq, rf);
}
static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
@@ -2880,9 +3093,10 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
- cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+ else
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
}
/* Assumes rq->lock is held */
@@ -2891,8 +3105,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
- cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
}
void __init init_sched_dl_class(void)
@@ -2970,7 +3183,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(&p->dl);
+ task_non_contending(&p->dl, false);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
@@ -3042,23 +3255,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3091,6 +3305,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(dl) = {
+ .queue_mask = 8,
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -3113,6 +3329,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..41caa22e0680 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
+ zero_vruntime = cfs_rq->zero_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
+ SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d07..6827689a0966 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root;
* guarantee system safety. Maintain a dedicated task list which contains every
* task between its fork and eventual free.
*/
-static DEFINE_SPINLOCK(scx_tasks_lock);
+static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as their size can exceed
+ * the percpu allocator limits on large machines. The O(nr_cpu_ids^2) allocation
+ * is done lazily when enabling and freed when disabling to avoid waste while
+ * sched_ext isn't active.
+ */
+struct scx_kick_pseqs {
+ struct rcu_head rcu;
+ unsigned long seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
/*
* Direct dispatch marker.
@@ -465,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
@@ -496,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter)
__scx_task_iter_rq_unlock(iter);
if (iter->list_locked) {
iter->list_locked = false;
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
}
}
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
if (!iter->list_locked) {
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->list_locked = true;
}
}
@@ -780,13 +791,23 @@ static void schedule_deferred(struct rq *rq)
if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
return;
+ /* Don't do anything if there already is a deferred operation. */
+ if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+ return;
+
/*
* If in balance, the balance callbacks will be called before rq lock is
* released. Schedule one.
+ *
+ * We can't directly insert the callback into the rq's list: the call
+ * can drop the rq lock and make the pending balance callback visible
+ * to unrelated code paths that call rq_pin_lock().
+ *
+ * Just let balance_one() know that it must do it itself.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
- queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
- deferred_bal_cb_workfn);
+ rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
return;
}
@@ -1453,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
struct scx_sched *sch = scx_root;
- struct task_struct *p = rq->curr;
+ struct task_struct *p = rq->donor;
if (SCX_HAS_OP(sch, yield))
SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
@@ -1464,7 +1485,7 @@ static void yield_task_scx(struct rq *rq)
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
struct scx_sched *sch = scx_root;
- struct task_struct *from = rq->curr;
+ struct task_struct *from = rq->donor;
if (SCX_HAS_OP(sch, yield))
return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
@@ -2003,6 +2024,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
dspc->cursor = 0;
}
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+ return;
+
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+
+ rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_sched *sch = scx_root;
@@ -2013,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
@@ -2119,40 +2153,6 @@ has_tasks:
return true;
}
-static int balance_scx(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
-{
- int ret;
-
- rq_unpin_lock(rq, rf);
-
- ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
- /*
- * When core-sched is enabled, this ops.balance() call will be followed
- * by pick_task_scx() on this CPU and the SMT siblings. Balance the
- * siblings too.
- */
- if (sched_core_enabled(rq)) {
- const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
- int scpu;
-
- for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
- struct rq *srq = cpu_rq(scpu);
- struct task_struct *sprev = srq->curr;
-
- WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
- update_rq_clock(srq);
- balance_one(srq, sprev);
- }
- }
-#endif
- rq_repin_lock(rq, rf);
-
- return ret;
-}
-
static void process_ddsp_deferred_locals(struct rq *rq)
{
struct task_struct *p;
@@ -2332,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
struct task_struct *p;
- bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
- bool kick_idle = false;
- /*
- * WORKAROUND:
- *
- * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
- * have gone through balance_scx(). Unfortunately, there currently is a
- * bug where fair could say yes on balance() but no on pick_task(),
- * which then ends up calling pick_task_scx() without preceding
- * balance_scx().
- *
- * Keep running @prev if possible and avoid stalling from entering idle
- * without balancing.
- *
- * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
- * if pick_task_scx() is called without preceding balance_scx().
- */
- if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev->scx.flags & SCX_TASK_QUEUED) {
- keep_prev = true;
- } else {
- keep_prev = false;
- kick_idle = true;
- }
- } else if (unlikely(keep_prev &&
- prev->sched_class != &ext_sched_class)) {
- /*
- * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
- * conditional on scx_enabled() and may have been skipped.
- */
+ rq_modified_clear(rq);
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+ maybe_queue_balance_callback(rq);
+ if (rq_modified_above(rq, &ext_sched_class))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
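rq_modified_clear()/rq_modified_above() and the per-class ->queue_mask bit come from the core.c/sched.h parts of this series; a rough illustrative sketch of the assumed semantics (not the in-tree implementation; enqueue/dequeue are assumed to OR the class bit into rq->queue_mask):

	static inline void rq_modified_clear(struct rq *rq)
	{
		rq->queue_mask = 0;
	}

	static inline bool rq_modified_above(struct rq *rq,
					     const struct sched_class *class)
	{
		/* any bit above @class's bit => a higher class touched the rq */
		return rq->queue_mask & ~((class->queue_mask << 1) - 1);
	}

With ext=1, fair=2, rt=4, pick_task_scx() returning RETRY_TASK whenever a higher class enqueued while the rq lock was dropped matches the use above.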
@@ -2904,9 +2886,9 @@ void scx_post_fork(struct task_struct *p)
}
}
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
list_add_tail(&p->scx.tasks_node, &scx_tasks);
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
percpu_up_read(&scx_fork_rwsem);
}
@@ -2930,9 +2912,9 @@ void sched_ext_free(struct task_struct *p)
{
unsigned long flags;
- spin_lock_irqsave(&scx_tasks_lock, flags);
+ raw_spin_lock_irqsave(&scx_tasks_lock, flags);
list_del_init(&p->scx.tasks_node);
- spin_unlock_irqrestore(&scx_tasks_lock, flags);
+ raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
* @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
@@ -2961,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3234,6 +3216,8 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
@@ -3241,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = {
.wakeup_preempt = wakeup_preempt_scx,
- .balance = balance_scx,
.pick_task = pick_task_scx,
.put_prev_task = put_prev_task_scx,
@@ -3471,7 +3454,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_dispatch_q *dsq;
int node;
+ irq_work_sync(&sch->error_irq_work);
kthread_stop(sch->helper->task);
+
free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
@@ -3780,11 +3765,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -3850,6 +3834,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+ struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+ kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+ struct scx_kick_pseqs *to_free;
+
+ to_free = rcu_replace_pointer(*pseqs, NULL, true);
+ if (to_free)
+ call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+ }
+}
+
static void scx_disable_workfn(struct kthread_work *work)
{
struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
@@ -3913,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
-
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
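The scoped_guard (sched_change, ...) form is provided by the core.c side of this series; conceptually it wraps the open-coded pair the removed lines spelled out. A sketch of the equivalence (not the actual guard definition):

	/*
	 *	scoped_guard (sched_change, p, queue_flags) {
	 *		p->sched_class = new_class;
	 *	}
	 *
	 * behaves roughly like:
	 *
	 *	sched_deq_and_put_task(p, queue_flags, &ctx);	// scope entry
	 *	p->sched_class = new_class;
	 *	sched_enq_and_set_task(&ctx);			// scope exit
	 *
	 * with DEQUEUE_CLASS (set when old_class != new_class) additionally
	 * making the guard invoke the class-switch methods.
	 */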
@@ -3986,6 +3989,7 @@ static void scx_disable_workfn(struct kthread_work *work)
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
+ free_kick_pseqs();
mutex_unlock(&scx_enable_mutex);
@@ -4216,7 +4220,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
size_t avail, used;
bool idle;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
@@ -4285,7 +4289,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
scx_dump_task(&s, &dctx, p, ' ');
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
}
dump_newline(&s);
@@ -4348,6 +4352,33 @@ static void scx_vexit(struct scx_sched *sch,
irq_work_queue(&sch->error_irq_work);
}
+static int alloc_kick_pseqs(void)
+{
+ int cpu;
+
+ /*
+ * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+ * can exceed percpu allocator limits on large machines.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+ struct scx_kick_pseqs *new_pseqs;
+
+ WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+ new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!new_pseqs) {
+ free_kick_pseqs();
+ return -ENOMEM;
+ }
+
+ rcu_assign_pointer(*pseqs, new_pseqs);
+ }
+
+ return 0;
+}
+
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
struct scx_sched *sch;
@@ -4392,8 +4423,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
- if (!sch->helper)
+ if (IS_ERR(sch->helper)) {
+ ret = PTR_ERR(sch->helper);
goto err_free_pcpu;
+ }
+
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -4495,10 +4529,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
+ ret = alloc_kick_pseqs();
+ if (ret)
+ goto err_unlock;
+
sch = scx_alloc_and_add_sched(ops);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_unlock;
+ goto err_free_pseqs;
}
/*
@@ -4657,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
-
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -4701,6 +4735,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
+err_free_pseqs:
+ free_kick_pseqs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -5082,10 +5118,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
struct scx_rq *this_scx = &this_rq->scx;
- unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+ struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
bool should_wait = false;
+ unsigned long *pseqs;
s32 cpu;
+ if (unlikely(!pseqs_pcpu)) {
+ pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+ return;
+ }
+
+ pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
for_each_cpu(cpu, this_scx->cpus_to_kick) {
should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
@@ -5208,11 +5252,6 @@ void __init init_sched_ext_class(void)
scx_idle_init_masks();
- scx_kick_cpus_pnt_seqs =
- __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
- __alignof__(scx_kick_cpus_pnt_seqs[0]));
- BUG_ON(!scx_kick_cpus_pnt_seqs);
-
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
int n = cpu_to_node(cpu);
@@ -5225,8 +5264,8 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
- init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
- init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+ rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
+ rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
if (cpu_online(cpu))
cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
@@ -5688,8 +5727,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
@@ -5820,8 +5859,8 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
@@ -6305,7 +6344,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
guard(rcu)();
- sch = rcu_dereference(sch);
+ sch = rcu_dereference(scx_root);
if (unlikely(!sch))
return;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc0b7ce8a65d..769d7b7990df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
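A standalone toy computation of the weighted average avg_vruntime() returns, showing that zero_vruntime is only a reference point (user-space sketch; load scaling and the curr special-casing omitted):

	#include <stdio.h>

	int main(void)
	{
		/* two runnable entities, keyed relative to zero_vruntime */
		long long w[]   = { 1024, 2048 };	/* weights             */
		long long key[] = {  300, -150 };	/* v_i - zero_vruntime */
		long long avg = 0, load = 0;

		for (int i = 0; i < 2; i++) {
			avg  += key[i] * w[i];	/* \Sum (v_i - v0) * w_i */
			load += w[i];		/* \Sum w_i              */
		}
		/* V = v0 + avg/load is the zero-lag point; here avg/load == 0 */
		printf("V - zero_vruntime = %lld\n", avg / load);
		return 0;
	}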
/*
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
-
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
-
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
+ avg_vruntime_update(cfs_rq, delta);
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
+ /*
+ * Picking the ->next buddy will affect latency but not fairness.
+ */
+ if (sched_feat(PICK_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
+ return cfs_rq->next;
+ }
+
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
@@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
return delta_exec;
}
+static void set_next_buddy(struct sched_entity *se);
+
/*
* Used by other classes to account runtime.
*/
@@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
* against fair_server such that it can account for this time
* and possibly avoid running this period.
*/
- if (dl_server_active(&rq->fair_server))
- dl_server_update(&rq->fair_server, delta_exec);
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
}
}
@@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
@@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
- /*
- * Picking the ->next buddy will affect latency but not fairness.
- */
- if (sched_feat(PICK_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
- /* ->next will never be delayed */
- WARN_ON_ONCE(cfs_rq->next->sched_delayed);
- return cfs_rq->next;
- }
-
se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -6024,20 +5980,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
/*
- * It's possible we are called with !runtime_remaining due to things
- * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
- * unthrottled us with a positive runtime_remaining but other still
- * running entities consumed those runtime before we reached here.
+	 * It's possible we are called with runtime_remaining < 0 due to things
+	 * like an async unthrottle leaving us a positive runtime_remaining that
+	 * other still-running entities consumed before we reached here.
*
- * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
- * because any enqueue in tg_unthrottle_up() will immediately trigger a
- * throttle, which is not supposed to happen on unthrottle path.
+ * We can't unthrottle this cfs_rq without any runtime remaining because
+ * any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
+	 * which is not supposed to happen on the unthrottle path.
*/
if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
return;
- se = cfs_rq->tg->se[cpu_of(rq)];
-
cfs_rq->throttled = 0;
update_rq_clock(rq);
@@ -6437,6 +6390,16 @@ static void sync_throttle(struct task_group *tg, int cpu)
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
+
+ /*
+ * It is not enough to sync the "pelt_clock_throttled" indicator
+ * with the parent cfs_rq when the hierarchy is not queued.
+ * Always join a throttled hierarchy with PELT clock throttled
+	 * and leave it to the first enqueue, or to distribution, to
+	 * unthrottle the PELT clock.
+ */
+ if (cfs_rq->throttle_count)
+ cfs_rq->pelt_clock_throttled = 1;
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6996,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
h_nr_idle = 1;
}
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
- /* Account for idle runtime */
- if (!rq->nr_running)
- dl_server_update_idle_time(rq, rq->curr);
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
- }
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
@@ -7028,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
* Basically dequeue_task_fair(), except it can deal with dequeue_entity()
* failing half-way through and resume the dequeue later.
@@ -8705,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
set_task_max_allowed_capacity(p);
}
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- if (sched_fair_runnable(rq))
- return 1;
-
- return sched_balance_newidle(rq, rf) != 0;
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -8725,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+enum preempt_wakeup_action {
+ PREEMPT_WAKEUP_NONE, /* No preemption. */
+ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
+ PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
+ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
+};
+
+static inline bool
+set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ /*
+ * Keep existing buddy if the deadline is sooner than pse.
+ * The older buddy may be cache cold and completely unrelated
+ * to the current wakeup but that is unpredictable where as
+ * obeying the deadline is more in line with EEVDF objectives.
+ */
+ if (cfs_rq->next && entity_before(cfs_rq->next, pse))
+ return false;
+
+ set_next_buddy(pse);
+ return true;
+}
+
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ u64 threshold, delta;
+
+ /*
+ * WF_SYNC without WF_TTWU is not expected so warn if it happens even
+ * though it is likely harmless.
+ */
+ WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+
+ threshold = sysctl_sched_migration_cost;
+ delta = rq_clock_task(rq) - se->exec_start;
+ if ((s64)delta < 0)
+ delta = 0;
+
+ /*
+ * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
+ * could run on other CPUs. Reduce the threshold before preemption is
+ * allowed to an arbitrary lower value as it is more likely (but not
+ * guaranteed) the waker requires the wakee to finish.
+ */
+ if (wake_flags & WF_RQ_SELECTED)
+ threshold >>= 2;
+
+ /*
+ * As WF_SYNC is not strictly obeyed, allow some runtime for batch
+ * wakeups to be issued.
+ */
+ if (entity_before(pse, se) && delta >= threshold)
+ return PREEMPT_WAKEUP_RESCHED;
+
+ return PREEMPT_WAKEUP_NONE;
+}
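Worked numbers for the thresholds above, assuming the default sysctl_sched_migration_cost of 500000 ns (the knob is tunable, so these are illustrative only):

	/*
	 *	WF_SYNC|WF_TTWU wakeup:		reschedule only if pse has the
	 *					earlier deadline and the waker
	 *					already ran >= 500 us;
	 *	plus WF_RQ_SELECTED:		threshold drops to 500 us >> 2
	 *					= 125 us, since the tasks chose
	 *					to stack on this CPU.
	 */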
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
- bool do_preempt_short = false;
if (unlikely(se == pse))
return;
@@ -8748,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (task_is_throttled(p))
return;
- if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
- set_next_buddy(pse);
- }
-
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
@@ -8783,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* When non-idle entity preempt an idle entity,
* don't give idle entity slice protection.
*/
- do_preempt_short = true;
+ preempt_action = PREEMPT_WAKEUP_SHORT;
goto preempt;
}
@@ -8802,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
*/
- do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
+ if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
+ preempt_action = PREEMPT_WAKEUP_SHORT;
+ goto pick;
+ }
+
+ /*
+ * Ignore wakee preemption on WF_FORK as it is less likely that
+	 * there is shared data as exec often follows fork. Do not
+ * preempt for tasks that are sched_delayed as it would violate
+ * EEVDF to forcibly queue an ineligible task.
+ */
+ if ((wake_flags & WF_FORK) || pse->sched_delayed)
+ return;
/*
+	 * If @p is potentially completing work required by current then
+	 * consider preemption.
+	 *
+	 * Reschedule if the waker is no longer eligible.
+	 */
+ if (in_task() && !entity_eligible(cfs_rq, se)) {
+ preempt_action = PREEMPT_WAKEUP_RESCHED;
+ goto preempt;
+ }
+
+ /* Prefer picking wakee soon if appropriate. */
+ if (sched_feat(NEXT_BUDDY) &&
+ set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
+
+ /*
+ * Decide whether to obey WF_SYNC hint for a new buddy. Old
+ * buddies are ignored as they may not be relevant to the
+ * waker and less likely to be cache hot.
+ */
+ if (wake_flags & WF_SYNC)
+ preempt_action = preempt_sync(rq, wake_flags, pse, se);
+ }
+
+ switch (preempt_action) {
+ case PREEMPT_WAKEUP_NONE:
+ return;
+ case PREEMPT_WAKEUP_RESCHED:
+ goto preempt;
+ case PREEMPT_WAKEUP_SHORT:
+ fallthrough;
+ case PREEMPT_WAKEUP_PICK:
+ break;
+ }
+
+pick:
+ /*
* If @p has become the most eligible task, force preemption.
*/
- if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
+ if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
goto preempt;
- if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
+ if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
return;
preempt:
- if (do_preempt_short)
+ if (preempt_action == PREEMPT_WAKEUP_SHORT)
cancel_protect_slice(se);
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
@@ -8866,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- p = pick_task_fair(rq);
+ p = pick_task_fair(rq, rf);
if (!p)
goto idle;
se = &p->se;
@@ -8920,21 +8976,21 @@ simple:
return p;
idle:
- if (!rf)
- return NULL;
-
- new_tasks = sched_balance_newidle(rq, rf);
+ if (rf) {
+ new_tasks = sched_balance_newidle(rq, rf);
- /*
- * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- * must re-start the pick_next_entity() loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
+ /*
+ * Because sched_balance_newidle() releases (and re-acquires)
+ * rq->lock, it is possible for any higher priority task to
+ * appear. In that case we must re-start the pick_next_entity()
+ * loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
- if (new_tasks > 0)
- goto again;
+ if (new_tasks > 0)
+ goto again;
+ }
/*
* rq is about to be idle, check if we need to update the
@@ -8945,14 +9001,10 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
- return pick_next_task_fair(rq, prev, NULL);
-}
-
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
-{
- return pick_task_fair(dl_se->rq);
+ return pick_task_fair(dl_se->rq, rf);
}
void fair_server_init(struct rq *rq)
@@ -8983,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
*/
static void yield_task_fair(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr = rq->donor;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
@@ -9007,7 +9059,18 @@ static void yield_task_fair(struct rq *rq)
*/
rq_clock_skip_update(rq);
- se->deadline += calc_delta_fair(se->slice, se);
+ /*
+ * Forfeit the remaining vruntime, only if the entity is eligible. This
+ * condition is necessary because in core scheduling we prefer to run
+ * ineligible tasks rather than force idling. If this happens we may
+ * end up in a loop where the core scheduler picks the yielding task,
+ * which yields immediately again; without the condition the vruntime
+ * ends up quickly running away.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ se->vruntime = se->deadline;
+ se->deadline += calc_delta_fair(se->slice, se);
+ }
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -10671,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
- for_each_cpu(i, sched_group_span(group)) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -11723,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
}
/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
+/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
@@ -11747,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ bool need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -11758,6 +11837,14 @@ redo:
goto out_balanced;
}
+ if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
+		int zero = 0;
+
+		if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
+ goto out_balanced;
+
+ need_unlock = true;
+ }
+
group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
@@ -11998,6 +12085,9 @@ out_one_pinned:
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
+ if (need_unlock)
+ atomic_set_release(&sched_balance_running, 0);
+
return ld_moved;
}
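The SD_SERIALIZE handling above is a single-flight try-lock built from two atomics; a self-contained user-space equivalent of the pattern (illustrative naming, not kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int balance_running;

	static bool serialize_enter(void)
	{
		int zero = 0;

		/* acquire pairs with the release in serialize_exit() */
		return atomic_compare_exchange_strong_explicit(&balance_running,
				&zero, 1, memory_order_acquire, memory_order_relaxed);
	}

	static void serialize_exit(void)
	{
		atomic_store_explicit(&balance_running, 0, memory_order_release);
	}

A caller that fails serialize_enter() simply skips the pass, mirroring the goto out_balanced above.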
@@ -12123,21 +12213,6 @@ out_unlock:
}
/*
- * This flag serializes load-balancing passes over large domains
- * (above the NODE topology level) - only one load-balancing instance
- * may run at a time, to reduce overhead on very large systems with
- * lots of CPUs and large NUMA distances.
- *
- * - Note that load-balancing passes triggered while another one
- * is executing are skipped and not re-tried.
- *
- * - Also note that this does not serialize rebalance_domains()
- * execution, as non-SD_SERIALIZE domains will still be
- * load-balanced in parallel.
- */
-static atomic_t sched_balance_running = ATOMIC_INIT(0);
-
-/*
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
@@ -12146,30 +12221,43 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
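Worked numbers for the windowing above (hypothetical values):

	/*
	 * Say 256 weighted successes are recorded over the first 1024 calls.
	 * At the 1024th call:
	 *
	 *	newidle_ratio   = 256			// ~25%, on a 0..1024 scale
	 *	newidle_call    = 1024 / 2 = 512
	 *	newidle_success =  256 / 2 = 128
	 *
	 * so the next window starts half full and the ratio refreshes after
	 * roughly 512 further calls, i.e. a cheap moving average.
	 */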
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
+ if (cost)
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
- *
- * sched_balance_newidle() bumps the cost whenever newidle
- * balance fails, and we don't want things to grow out of
- * control. Use the sysctl_sched_migration_cost as the upper
- * limit, plus a litle extra to avoid off by ones.
*/
- sd->max_newidle_lb_cost =
- min(cost, sysctl_sched_migration_cost + 200);
- sd->last_decay_max_lb_cost = jiffies;
- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = now;
+
+ } else if (time_after(now, next_decay)) {
/*
* Decay the newidle max times by ~1% per second to ensure that
* it is not outdated and the current max cost is actually
* shorter.
*/
sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
- sd->last_decay_max_lb_cost = jiffies;
-
+ sd->last_decay_max_lb_cost = now;
return true;
}
@@ -12192,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
+ int need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
@@ -12201,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12216,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
}
interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
- goto out;
- }
-
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
@@ -12236,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
- if (need_serialize)
- atomic_set_release(&sched_balance_running, 0);
-out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
@@ -12817,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();
+ rq_modified_clear(this_rq);
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12844,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+				 * Throw a 1k-sided die and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12855,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = t1;
/*
- * Failing newidle means it is not effective;
- * bump the cost so we end up doing less of it.
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
*/
- if (!pulled_task)
- domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
-
- update_newidle_cost(sd, domain_cost);
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
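Continuing that example with newidle_ratio == 256 (~25% success), the NI_RANDOM path above works out roughly as:

	/*
	 *	weight = 1 + 256 = 257
	 *	balance runs when d1k <= 257, i.e. ~25% of the time
	 *	on a run: weight = (1024 + 257/2) / 257 = 1152 / 257 = 4
	 *
	 * so a run that does pull a task adds ~4 to the success stats,
	 * compensating for the ~3 in 4 attempts that were skipped and
	 * keeping newidle_ratio tracking the real success rate.
	 */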
/*
@@ -12886,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13005,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
}
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * Consider any infeasible weight scenario. Take for instance two tasks,
+ * each bound to their respective sibling, one with weight 1 and one with
+ * weight 2. Then the lower weight task will run ahead of the higher weight
+ * task without bound.
+ *
+ * This utterly destroys the concept of a shared time base.
+ *
+ * Remember; all this is about a proportionally fair scheduling, where each
+ * tasks receives:
+ *
+ * w_i
+ * dt_i = ---------- dt (1)
+ * \Sum_j w_j
+ *
+ * which we do by tracking a virtual time, s_i:
+ *
+ * 1
+ * s_i = --- d[t]_i (2)
+ * w_i
+ *
+ * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
+ * The immediate corollary is that the ideal schedule S, where (2) uses
+ * an infinitesimal delta, is:
+ *
+ * 1
+ * S = ---------- dt (3)
+ * \Sum_i w_i
+ *
+ * From which we can define the lag, or deviation from the ideal, as:
+ *
+ * lag(i) = S - s_i (4)
+ *
+ * And since the one and only purpose is to approximate S, we get that:
+ *
+ * \Sum_i w_i lag(i) := 0 (5)
+ *
+ * If this were not so, we no longer converge to S, and we can no longer
+ * claim our scheduler has any of the properties we derive from S. This is
+ * exactly what you did above, you broke it!
+ *
+ *
+ * Let's continue for a while though; to see if there is anything useful to
+ * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
+ *
+ * \Sum_i w_i s_i
+ * S = -------------- (6)
+ * \Sum_i w_i
+ *
+ * Which gives us a way to compute S, given our s_i. Now, if you've read
+ * our code, you know that we do not in fact do this; the reason for this
+ * is two-fold. Firstly, computing S in that way requires a 64bit division
+ * for every time we'd use it (see 12), and secondly, this only describes
+ * the steady-state, it doesn't handle dynamics.
+ *
+ * Anyway, in (6): s_i -> x + (s_i - x), to get:
+ *
+ * \Sum_i w_i (s_i - x)
+ * S - x = -------------------- (7)
+ * \Sum_i w_i
+ *
+ * Which shows that S and s_i transform alike (which makes perfect sense
+ * given that S is basically the (weighted) average of s_i).
+ *
+ * So the thing to remember is that the above is strictly UP. It is
+ * possible to generalize to multiple runqueues -- however it gets really
+ * yuck when you have to add affinity support, as illustrated by our very
+ * first counter-example.
+ *
+ * Luckily I think we can avoid needing a full multi-queue variant for
+ * core-scheduling (or load-balancing). The crucial observation is that we
+ * only actually need this comparison in the presence of forced-idle; only
+ * then do we need to tell if the stalled rq has higher priority over the
+ * other.
+ *
+ * [XXX assumes SMT2; better consider the more general case, I suspect
+ * it'll work out because our comparison is always between 2 rqs and the
+ * answer is only interesting if one of them is forced-idle]
+ *
+ * And (under assumption of SMT2) when there is forced-idle, there is only
+ * a single queue, so everything works like normal.
+ *
+ * Let, for our runqueue 'k':
+ *
+ * T_k = \Sum_i w_i s_i
+ * W_k = \Sum_i w_i ; for all i of k (8)
+ *
+ * Then we can write (6) like:
+ *
+ * T_k
+ * S_k = --- (9)
+ * W_k
+ *
+ * From which immediately follows that:
+ *
+ * T_k + T_l
+ * S_k+l = --------- (10)
+ * W_k + W_l
+ *
+ * On which we can define a combined lag:
+ *
+ * lag_k+l(i) := S_k+l - s_i (11)
+ *
+ * And that gives us the tools to compare tasks across a combined runqueue.
+ *
+ *
+ * Combined this gives the following:
+ *
+ * a) when a runqueue enters force-idle, sync it against its sibling rq(s)
+ * using (7); this only requires storing single 'time'-stamps.
+ *
+ * b) when comparing tasks between 2 runqueues of which one is forced-idle,
+ * compare the combined lag, per (11).
+ *
+ * Now, of course cgroups (I so hate them) make this more interesting in
+ * that a) seems to suggest we need to iterate all cgroup on a CPU at such
+ * boundaries, but I think we can avoid that. The force-idle is for the
+ * whole CPU, all its rqs. So we can mark it in the root and lazily
+ * propagate downward on demand.
+ */
+
+/*
+ * So this sync is basically a relative reset of S to 0.
+ *
+ * So with 2 queues, when one goes idle, we drop them both to 0 and one
+ * then increases due to not being idle, and the idle one builds up lag to
+ * get re-elected. So far so simple, right?
+ *
+ * When there's 3, we can have the situation where 2 run and one is idle,
+ * we sync to 0 and let the idle one build up lag to get re-election. Now
+ * suppose another one also drops idle. At this point dropping all to 0
+ * again would destroy the built-up lag from the queue that was already
+ * idle, not good.
+ *
+ * So instead of syncing everything, we can:
+ *
+ * less := !((s64)(s_a - s_b) <= 0)
+ *
+ * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
+ * == v_a - (v_b - S_a + S_b)
+ *
+ * IOW, we can recast the (lag) comparison to a one-sided difference.
+ * So if then, instead of syncing the whole queue, sync the idle queue
+ * against the active queue with S_a + S_b at the point where we sync.
+ *
+ * (XXX consider the implication of living in a cyclic group: N / 2^n N)
+ *
+ * This gives us means of syncing single queues against the active queue,
+ * and for already idle queues to preserve their build-up lag.
+ *
+ * Of course, then we get the situation where there's 2 active and one
+ * going idle, who do we pick to sync against? Theory would have us sync
+ * against the combined S, but as we've already demonstrated, there is no
+ * such thing in infeasible weight scenarios.
+ *
+ * One thing I've considered; and this is where that core_active rudiment
+ * came from, is having active queues sync up between themselves after
+ * every tick. This limits the observed divergence due to the work
+ * conservancy.
+ *
+ * On top of that, we can improve upon things by employing (10) here.
+ */
+
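A standalone numeric instance of (8)-(11), using the weight-1/weight-2 counter-example the comment opens with (assumed values, user-space sketch):

	#include <stdio.h>

	int main(void)
	{
		/* rq k: one task, w = 1, s = 900; rq l: one task, w = 2, s = 300 */
		long long T_k = 1 * 900, W_k = 1;	/* (8): T = \Sum w_i s_i */
		long long T_l = 2 * 300, W_l = 2;
		long long S_kl = (T_k + T_l) / (W_k + W_l);	/* (10): 1500/3 = 500 */

		/* (11): combined lag; positive means the task is owed service */
		printf("lag(k task) = %lld\n", S_kl - 900);	/* -400: ran ahead   */
		printf("lag(l task) = %lld\n", S_kl - 300);	/*  200: fell behind */
		return 0;
	}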
+/*
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -13019,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -13072,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13138,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13154,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13187,6 +13448,8 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
+
+ assert_list_leaf_cfs_rq(rq_of(cfs_rq));
}
#else /* !CONFIG_FAIR_GROUP_SCHED: */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
@@ -13237,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13310,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
}
@@ -13611,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -13619,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = check_preempt_wakeup_fair,
.pick_task = pick_task_fair,
- .pick_next_task = __pick_next_task_fair,
+ .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
- .balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13638,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = {
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..980d92bab8ab 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true)
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
*/
-SCHED_FEAT(NEXT_BUDDY, false)
+SCHED_FEAT(NEXT_BUDDY, true)
/*
* Allow completely ignoring cfs_rq->next; which can be set from various
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089d4f09..c174afe1dd17 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void)
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ u64 max_latency_ns)
{
if (current_clr_polling_and_test())
return -EBUSY;
- return cpuidle_enter_s2idle(drv, dev);
+ return cpuidle_enter_s2idle(drv, dev, max_latency_ns);
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -205,12 +206,13 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
-		entered_state = call_cpuidle_s2idle(drv, dev);
+		max_latency_ns = cpu_wakeup_latency_qos_limit() *
+				 NSEC_PER_USEC;
+		entered_state = call_cpuidle_s2idle(drv, dev,
+						    max_latency_ns);
if (entered_state > 0)
goto exit_idle;
-
- max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
@@ -452,9 +454,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
+static void update_curr_idle(struct rq *rq);
+
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- dl_server_update_idle_time(rq, prev);
+ update_curr_idle(rq);
scx_update_idle(rq, false, true);
}
@@ -466,7 +470,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
next->se.exec_start = rq_clock_task(rq);
}
-struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
{
scx_update_idle(rq, true, false);
return rq->idle;
@@ -496,21 +500,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
+ update_curr_idle(rq);
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
static void update_curr_idle(struct rq *rq)
{
+ struct sched_entity *se = &rq->idle->se;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - se->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ se->exec_start = now;
+
+ dl_server_update_idle(&rq->fair_server, delta_exec);
}
/*
@@ -518,6 +537,8 @@ static void update_curr_idle(struct rq *rq)
*/
DEFINE_SCHED_CLASS(idle) = {
+ .queue_mask = 0,
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -536,6 +557,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index a4cf17b1fab0..3ad0d6df6a0a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
}
}
+ /*
+ * Check the combination of nohz_full and isolcpus=domain,
+ * necessary to avoid problems with the timer migration
+ * hierarchy. managed_irq is ignored by this check since it
+ * isn't considered in the timer migration logic.
+ */
+ iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ type = find_first_bit(&iter_flags, HK_TYPE_MAX);
+ /*
+ * Pass the check if none of these flags were previously set or
+ * are not in the current selection.
+ */
+ iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
+ cpumask_first_and_and(cpu_present_mask,
+ housekeeping_staging, housekeeping.cpumasks[type]);
+ if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
+ pr_warn("Housekeeping: must include one present CPU "
+ "neither in nohz_full= nor in isolcpus=domain, "
+ "ignoring setting %s\n", str);
+ goto free_housekeeping_staging;
+ }
+
iter_flags = flags & ~housekeeping.flags;
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 62fba83b7bb1..623445603725 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_preempt(current);
+ rseq_sched_switch_event(current);
}
static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id)
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
- * For RSEQ, don't rseq_preempt() the caller. User code
- * is not supposed to issue syscalls at all from inside an
- * rseq critical section.
+ * For RSEQ, don't invoke rseq_sched_switch_event() on the
+ * caller. User code is not supposed to issue syscalls at
+ * all from inside an rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7936d4333731..f1867fe8e5c5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
static void yield_task_rt(struct rq *rq)
{
- requeue_task_rt(rq, rq->curr, 0);
+ requeue_task_rt(rq, rq->donor, 0);
}
static int find_lowest_rq(struct task_struct *task);
@@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *pick_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *p;
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
@@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(rt) = {
+ .queue_mask = 4,
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
@@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = {
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d07067f60..8590113e4a60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>
@@ -20,7 +21,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>
-
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>
@@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
* naturally thottled to once per period, avoiding high context switch
* workloads from spamming the hrtimer program/cancel paths.
*/
+extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
@@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
-extern void dl_server_update_idle_time(struct rq *rq,
- struct task_struct *p);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
@@ -682,10 +681,10 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
- u64 min_vruntime;
+ u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
- u64 min_vruntime_fi;
+ u64 zero_vruntime_fi;
#endif
struct rb_root_cached tasks_timeline;
@@ -780,10 +779,10 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
+ SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
@@ -1119,6 +1118,8 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
+ /* Per class runqueue modification mask; bits in class order. */
+ unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
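A hedged usage sketch (not part of the patch): sched_rng() hands back a raw 32-bit value from the per-CPU state, so callers are assumed to run with preemption disabled (e.g. under the rq lock) and to reduce the value themselves. The helper name below is hypothetical.

/* Hypothetical caller: pick one of @nr candidates at (roughly) random; @nr must be non-zero. */
static inline unsigned int sched_pick_random_idx(unsigned int nr)
{
	/* Modulo introduces a small bias; good enough for load-balance tie-breaks. */
	return sched_rng() % nr;
}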
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
@@ -1431,6 +1438,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
if (!sched_core_enabled(rq))
return true;
+ if (rq->core->core_cookie == p->core_cookie)
+ return true;
+
for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
if (!available_idle_cpu(cpu)) {
idle_core = false;
@@ -1442,7 +1452,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
* A CPU in an idle core is always the best choice for tasks with
* cookies.
*/
- return idle_core || rq->core->core_cookie == p->core_cookie;
+ return idle_core;
}
static inline bool sched_group_cookie_match(struct rq *rq,
@@ -1826,7 +1836,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1838,8 +1849,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ __task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1848,6 +1858,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
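A hedged usage sketch of the new guard, mirroring the CLASS() pattern this series removes from set_user_nice() further down; illustrative only, not part of the patch.

	/* Lock p's rq without touching p->pi_lock; auto-unlock on scope exit. */
	CLASS(__task_rq_lock, guard)(p);
	struct rq *rq = guard.rq;

	update_rq_clock(rq);
	/* ... inspect or adjust p's per-rq state ... */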
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
@@ -2208,6 +2223,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
+ rseq_sched_set_ids_changed(p);
#endif /* CONFIG_SMP */
}
@@ -2341,8 +2357,7 @@ extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2355,34 +2370,46 @@ extern const u32 sched_prio_to_wmult[40];
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
+ * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
+ * SCHED_DEADLINE seems to rely on this for now.
*/
-#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
-#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
-#define DEQUEUE_SPECIAL 0x10
-#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
-#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
-#define DEQUEUE_THROTTLE 0x800
-
-#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_RESTORE 0x02
-#define ENQUEUE_MOVE 0x04
-#define ENQUEUE_NOCLOCK 0x08
-
-#define ENQUEUE_HEAD 0x10
-#define ENQUEUE_REPLENISH 0x20
-#define ENQUEUE_MIGRATED 0x40
-#define ENQUEUE_INITIAL 0x80
-#define ENQUEUE_MIGRATING 0x100
-#define ENQUEUE_DELAYED 0x200
-#define ENQUEUE_RQ_SELECTED 0x400
+#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
+#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */
+
+#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
+
+#define DEQUEUE_SPECIAL 0x00010000
+#define DEQUEUE_THROTTLE 0x00020000
+
+#define ENQUEUE_WAKEUP 0x0001
+#define ENQUEUE_RESTORE 0x0002
+#define ENQUEUE_MOVE 0x0004
+#define ENQUEUE_NOCLOCK 0x0008
+
+#define ENQUEUE_MIGRATING 0x0010
+#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
+
+#define ENQUEUE_HEAD 0x00010000
+#define ENQUEUE_REPLENISH 0x00020000
+#define ENQUEUE_MIGRATED 0x00040000
+#define ENQUEUE_INITIAL 0x00080000
+#define ENQUEUE_RQ_SELECTED 0x00100000
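The "Matches ENQUEUE_*" comments encode the invariant that the low dequeue and enqueue bits share values, so a saved dequeue mask can be replayed on the enqueue side. A hedged sketch of compile-time checks documenting that invariant (illustrative; the patch itself does not add them):

static_assert(DEQUEUE_SLEEP     == ENQUEUE_WAKEUP);
static_assert(DEQUEUE_SAVE      == ENQUEUE_RESTORE);
static_assert(DEQUEUE_MOVE      == ENQUEUE_MOVE);
static_assert(DEQUEUE_NOCLOCK   == ENQUEUE_NOCLOCK);
static_assert(DEQUEUE_MIGRATING == ENQUEUE_MIGRATING);
static_assert(DEQUEUE_DELAYED   == ENQUEUE_DELAYED);
static_assert(DEQUEUE_CLASS     == ENQUEUE_CLASS);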
#define RETRY_TASK ((void *)-1UL)
@@ -2399,16 +2426,61 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
+ /*
+ * idle: 0
+ * ext: 1
+ * fair: 2
+ * rt: 4
+ * dl: 8
+ * stop: 16
+ */
+ unsigned int queue_mask;
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
- struct task_struct *(*pick_task)(struct rq *rq);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
+ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
*
@@ -2418,55 +2490,123 @@ struct sched_class {
* set_next_task_first(next);
* }
*/
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
/*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serialized by
- * rq->lock. They are however serialized by p->pi_lock.
+ * sched_change
+ */
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ u64 oldprio);
+
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
*/
- void (*switching_to) (struct rq *this_rq, struct task_struct *task);
- void (*switched_from)(struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
- void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
+/*
+ * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
+ */
+static inline void rq_modified_clear(struct rq *rq)
+{
+ rq->queue_mask = 0;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
+{
+ unsigned int mask = class->queue_mask;
+ return rq->queue_mask & ~((mask << 1) - 1);
+}
+
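A worked example of the mask arithmetic above, using the per-class bit assignment documented in struct sched_class; the caller reaction is hypothetical.

	/*
	 * Fair's pick_task() dropped the rq lock and an RT task was
	 * enqueued in the meantime:
	 *
	 *   rq->queue_mask              == 0x4   (rt set its bit)
	 *   fair_sched_class.queue_mask == 0x2
	 *   ~((0x2 << 1) - 1)           == ~0x3  (strips the ext and fair bits;
	 *                                         idle contributes no bit)
	 *   0x4 & ~0x3                  == 0x4   -> a higher class touched the rq
	 */
	if (rq_modified_above(rq, &fair_sched_class)) {
		/* hypothetical reaction: redo the pick from the highest class */
	}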
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -2578,8 +2718,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_queued > 0;
}
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_task_idle(struct rq *rq);
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
@@ -2609,7 +2750,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
- * See do_set_cpus_allowed() above for the rcu_head usage.
+ * See set_cpus_allowed_force() above for the rcu_head usage.
*/
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
@@ -3539,285 +3680,212 @@ extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
-#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
-#define MM_CID_SCAN_DELAY 100 /* 100ms */
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+ return cid & MM_CID_ONCPU;
+}
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
+static __always_inline bool cid_in_transit(unsigned int cid)
+{
+ return cid & MM_CID_TRANSIT;
+}
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_ONCPU;
+}
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
{
- if (cid < 0)
- return;
- cpumask_clear_cpu(cid, mm_cidmask(mm));
+ return cid | MM_CID_ONCPU;
}
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
+static __always_inline unsigned int cid_to_transit_cid(unsigned int cid)
{
- struct mm_struct *mm = t->mm;
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ return cid | MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- if (!mm_cid_is_lazy_put(cid) ||
- !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+static __always_inline unsigned int cid_from_transit_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_TRANSIT;
}
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+static __always_inline bool cid_on_task(unsigned int cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, res;
+ /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */
+ return cid < MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- for (;;) {
- if (mm_cid_is_unset(cid))
- return MM_CID_UNSET;
- /*
- * Attempt transition from valid or lazy-put to unset.
- */
- res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
- if (res == cid)
- break;
- cid = res;
- }
- return cid;
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+ clear_bit(cid, mm_cidmask(mm));
}
-static inline void mm_cid_put(struct mm_struct *mm)
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
{
- int cid;
+ unsigned int cid = t->mm_cid.cid;
- lockdep_assert_irqs_disabled();
- cid = mm_cid_pcpu_unset(mm);
- if (cid == MM_CID_UNSET)
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ t->mm_cid.cid = MM_CID_UNSET;
+ if (cid_on_task(cid))
+ mm_drop_cid(t->mm, cid);
}
-static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
{
- struct cpumask *cidmask = mm_cidmask(mm);
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, max_nr_cid, allowed_max_nr_cid;
+ /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+}
- /*
- * After shrinking the number of threads or reducing the number
- * of allowed cpus, reduce the value of max_nr_cid so expansion
- * of cid allocation will preserve cache locality if the number
- * of threads or allowed cpus increase again.
- */
- max_nr_cid = atomic_read(&mm->max_nr_cid);
- while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
- atomic_read(&mm->mm_users))),
- max_nr_cid > allowed_max_nr_cid) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
- if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
- max_nr_cid = allowed_max_nr_cid;
- break;
- }
- }
- /* Try to re-use recent cid. This improves cache locality. */
- cid = __this_cpu_read(pcpu_cid->recent_cid);
- if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
- !cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- /*
- * Expand cid allocation if the maximum number of concurrency
- * IDs allocated (max_nr_cid) is below the number cpus allowed
- * and number of threads. Expanding cid allocation as much as
- * possible improves cache locality.
- */
- cid = max_nr_cid;
- while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
- if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
- continue;
- if (!cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- }
- /*
- * Find the first available concurrency id.
- * Retry finding first zero bit if the mask is temporarily
- * filled. This only happens during concurrent remote-clear
- * which owns a cid without holding a rq lock.
- */
- for (;;) {
- cid = cpumask_first_zero(cidmask);
- if (cid < READ_ONCE(mm->nr_cpus_allowed))
- break;
- cpu_relax();
- }
- if (cpumask_test_and_set_cpu(cid, cidmask))
- return -1;
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
return cid;
}
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
- lockdep_assert_rq_held(rq);
- WRITE_ONCE(pcpu_cid->time, rq->clock);
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ }
+ return cid;
}
-static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
{
- int cid;
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
- /*
- * All allocations (even those using the cid_lock) are lock-free. If
- * use_cid_lock is set, hold the cid_lock to perform cid allocation to
- * guarantee forward progress.
- */
- if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto end;
- raw_spin_lock(&cid_lock);
- } else {
- raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto unlock;
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
}
+ return orig_cid;
+}
- /*
- * cid concurrently allocated. Retry while forcing following
- * allocations to use the cid_lock to ensure forward progress.
- */
- WRITE_ONCE(use_cid_lock, 1);
- /*
- * Set use_cid_lock before allocation. Only care about program order
- * because this is only required for forward progress.
- */
- barrier();
- /*
- * Retry until it succeeds. It is guaranteed to eventually succeed once
- * all newcoming allocations observe the use_cid_lock flag set.
- */
- do {
- cid = __mm_cid_try_get(t, mm);
- cpu_relax();
- } while (cid < 0);
- /*
- * Allocate before clearing use_cid_lock. Only care about
- * program order because this is for forward progress.
- */
- barrier();
- WRITE_ONCE(use_cid_lock, 0);
-unlock:
- raw_spin_unlock(&cid_lock);
-end:
- mm_cid_snapshot_time(rq, mm);
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
- return cid;
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- struct cpumask *cpumask;
- int cid;
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
- lockdep_assert_rq_held(rq);
- cpumask = mm_cidmask(mm);
- cid = __this_cpu_read(pcpu_cid->cid);
- if (mm_cid_is_valid(cid)) {
- mm_cid_snapshot_time(rq, mm);
- return cid;
- }
- if (mm_cid_is_lazy_put(cid)) {
- if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
}
- cid = __mm_cid_get(rq, t, mm);
- __this_cpu_write(pcpu_cid->cid, cid);
- __this_cpu_write(pcpu_cid->recent_cid, cid);
-
- return cid;
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
}
-static inline void switch_mm_cid(struct rq *rq,
- struct task_struct *prev,
- struct task_struct *next)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
{
- /*
- * Provide a memory barrier between rq->curr store and load of
- * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
- *
- * Should be adapted if context_switch() is modified.
- */
- if (!next->mm) { // to kernel
- /*
- * user -> kernel transition does not guarantee a barrier, but
- * we can use the fact that it performs an atomic operation in
- * mmgrab().
- */
- if (prev->mm) // from user
- smp_mb__after_mmgrab();
- /*
- * kernel -> kernel transition does not change rq->curr->mm
- * state. It stays NULL.
- */
- } else { // to user
- /*
- * kernel -> user transition does not provide a barrier
- * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
- * Provide it here.
- */
- if (!prev->mm) { // from kernel
- smp_mb();
- } else { // from user
- /*
- * user->user transition relies on an implicit
- * memory barrier in switch_mm() when
- * current->mm changes. If the architecture
- * switch_mm() does not have an implicit memory
- * barrier, it is emitted here. If current->mm
- * is unchanged, no barrier is needed.
- */
- smp_mb__after_switch_mm();
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
}
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ /* Set the transition mode flag if required */
+ tcid |= READ_ONCE(mm->mm_cid.transit);
}
- if (prev->mm_cid_active) {
- mm_cid_snapshot_time(rq, prev->mm);
- mm_cid_put_lazy(prev);
- prev->mm_cid = -1;
- }
- if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+ /* During mode transitions CIDs are temporary and need to be dropped */
+ if (likely(!cid_in_transit(prev->mm_cid.cid)))
+ return;
+
+ mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+ prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ mm_cid_schedout(prev);
+ mm_cid_schedin(next);
}
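A hedged sketch of how the CID flag helpers above compose; it assumes mm is a live mm_struct with mm_cid initialized, and that MM_CID_ONCPU/MM_CID_TRANSIT/MM_CID_UNSET occupy high bits above the index space, as the cid_on_task() comment implies.

	unsigned int cid = mm_get_cid(mm);	/* plain index, no flag bits */

	WARN_ON_ONCE(!cid_on_task(cid));
	WARN_ON_ONCE(!cid_on_cpu(cid_to_cpu_cid(cid)));
	WARN_ON_ONCE(!cid_in_transit(cid_to_transit_cid(cid)));
	WARN_ON_ONCE(cid_from_transit_cid(cid_to_transit_cid(cid)) != cid);

	mm_drop_cid(mm, cid);			/* return it to the cidmask */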
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
-static inline void init_sched_mm_cid(struct task_struct *t) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
@@ -3876,32 +3944,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
*/
-struct sched_enq_and_set_ctx {
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(),
+ * ensuring the task's queueing state is idempotent across the operation.
+ */
+struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
- int queue_flags;
+ int flags;
bool queued;
bool running;
};
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
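A hedged usage sketch of the scoped form; the real conversions appear in the syscalls.c hunks below (e.g. set_user_nice()), and this condensed version only illustrates the shape, with 'nice' standing in for a caller-supplied value.

	/* p's rq must already be locked, e.g. via guard(task_rq_lock)(p). */
	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		/* The scheduler is unaware of p here; adjust its properties. */
		p->static_prio = NICE_TO_PRIO(nice);
		set_load_weight(p, true);
	}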
#include "ext.h"
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 26f3fd4d34ce..cbf7206b3f9d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 2d4e279f05ee..4f9192be4b5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
{
if (!sched_stop_runnable(rq))
return NULL;
@@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
	BUG(); /* it's impossible to change to this class */
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
@@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq)
*/
DEFINE_SCHED_CLASS(stop) = {
+ .queue_mask = 16,
+
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
@@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f36e84..0496dc29ed0f 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p)
void set_user_nice(struct task_struct *p, long nice)
{
- bool queued, running;
- struct rq *rq;
int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
+ guard(task_rq_lock)(p);
/*
* The RT priorities are set via sched_setscheduler(), but we still
@@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice)
return;
}
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
}
EXPORT_SYMBOL(set_user_nice);
@@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p,
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
+ int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
@@ -695,38 +674,27 @@ change:
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flags |= DEQUEUE_CLASS;
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- p->sched_class = next_class;
- p->prio = newprio;
- }
- __setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
+ scoped_guard (sched_change, p, queue_flags) {
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
- enqueue_task(rq, p, queue_flags);
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
}
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
@@ -856,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p)
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+/*
+ * Used when the primary interrupt handler is forced into a thread, in addition
+ * to the (always threaded) secondary handler. The secondary handler gets a
+ * slightly lower priority so that the primary handler can preempt it, thereby
+ * emulating the behavior of a non-PREEMPT_RT system where the primary handler
+ * runs in hard interrupt context.
+ */
+void sched_set_fifo_secondary(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+
void sched_set_normal(struct task_struct *p, int nice)
{
struct sched_attr attr = {
@@ -1351,7 +1332,7 @@ static void do_sched_yield(void)
rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
+ rq->donor->sched_class->yield_task(rq);
preempt_disable();
rq_unlock_irq(rq, &rf);
@@ -1420,12 +1401,13 @@ EXPORT_SYMBOL(yield);
*/
int __sched yield_to(struct task_struct *p, bool preempt)
{
- struct task_struct *curr = current;
+ struct task_struct *curr;
struct rq *rq, *p_rq;
int yielded = 0;
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
rq = this_rq();
+ curr = rq->donor;
again:
p_rq = task_rq(p);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..cf643a5ddedd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
static int sched_domains_numa_levels;
+static int sched_numa_node_levels;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif /* CONFIG_NUMA */
@@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
@@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance)
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
+ distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node)
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(int offline_node)
+/*
+ * An architecture may modify its NUMA distances to change the grouping
+ * of NUMA nodes and the number of NUMA levels used when creating the
+ * NUMA-level sched domains.
+ *
+ * A NUMA level is created for each unique arch_sched_node_distance()
+ * value.
+ */
+static int numa_node_dist(int i, int j)
{
- struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+ __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+ return numa_node_dist != arch_sched_node_distance;
+}
+
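A hedged sketch of what an architecture override could look like; it is purely hypothetical (no architecture in this patch provides one) and simply collapses all remote distances into a single level, which modified_sched_node_distance() would then detect via the weak alias.

int arch_sched_node_distance(int from, int to)
{
	int dist = node_distance(from, to);

	/* Treat every remote node as equally distant: one remote NUMA level. */
	return dist > LOCAL_DISTANCE ? REMOTE_DISTANCE : dist;
}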
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+ int **dist, int *levels)
+{
+ unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
- struct cpumask ***masks;
/*
* O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
- return;
+ return -ENOMEM;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
- int distance = node_distance(i, j);
+ int distance = n_dist(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
- bitmap_free(distance_map);
- return;
+ return -EINVAL;
}
bitmap_set(distance_map, distance, 1);
@@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!distances) {
- bitmap_free(distance_map);
- return;
- }
+ if (!distances)
+ return -ENOMEM;
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
- rcu_assign_pointer(sched_domains_numa_distance, distances);
+ *dist = distances;
+ *levels = nr_levels;
- bitmap_free(distance_map);
+ return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ int nr_levels, nr_node_levels;
+ int i, j;
+ int *distances, *domain_distances;
+ struct cpumask ***masks;
+
+ /* Record the NUMA distances from SLIT table */
+ if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+ &nr_node_levels))
+ return;
+
+ /* Record modified NUMA distances for building sched domains */
+ if (modified_sched_node_distance()) {
+ if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+ &domain_distances, &nr_levels)) {
+ kfree(distances);
+ return;
+ }
+ } else {
+ domain_distances = distances;
+ nr_levels = nr_node_levels;
+ }
+ rcu_assign_pointer(sched_numa_node_distance, distances);
+ WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+ WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
/*
* 'nr_levels' contains the number of unique distances
@@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
+ rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
sched_domains_numa_levels = 0;
masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;
for_each_cpu_node_but(k, offline_node) {
- if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ if (sched_debug() &&
+ (arch_sched_node_distance(j, k) !=
+ arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
- if (node_distance(j, k) > sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, k) >
+ sched_domains_numa_distance[i])
continue;
cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node)
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
init_numa_topology_type(offline_node);
}
@@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node)
static void sched_reset_numa(void)
{
- int nr_levels, *distances;
+ int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;
nr_levels = sched_domains_numa_levels;
+ sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
- distances = sched_domains_numa_distance;
+ distances = sched_numa_node_distance;
+ if (sched_numa_node_distance != sched_domains_numa_distance)
+ dom_distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2083,6 +2151,7 @@ static void sched_reset_numa(void)
synchronize_rcu();
kfree(distances);
+ kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;
@@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;
/* Set ourselves in the remote node's masks */
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, node) <=
+ sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..e42b8bd6922f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d1efec571a4a..0f7519f8e7c9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
#ifdef CONFIG_IRQ_WORK
static void task_work_set_notify_irq(struct irq_work *entry)
{
- test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ /*
+ * no-op IPI
+ *
+	 * TWA_NMI_CURRENT will already have set the TIF flag; all
+	 * this interrupt does is tickle the return-to-user path.
+ */
}
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
@@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
break;
#ifdef CONFIG_IRQ_WORK
case TWA_NMI_CURRENT:
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
break;
#endif
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88aa062b8a55..f8ea8c8fc895 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
int ret;
hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
- hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+ hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
restart = &current->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid;
- restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5b6997f4dc3d..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.ns_type = ns_common_type(&init_time_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_time_ns),
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2e5b89d7d866..0de2bb7cbec0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* Report back to the user the time still remaining.
*/
restart = &current->restart_block;
- restart->nanosleep.expires = expires;
+ restart->nanosleep.expires = ns_to_ktime(expires);
if (restart->nanosleep.type != TT_NONE)
error = nanosleep_copyout(restart, &it.it_value);
}
@@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec64 t;
- t = ns_to_timespec64(restart_block->nanosleep.expires);
+ t = ktime_to_timespec64(restart_block->nanosleep.expires);
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index aa3120104a51..80a8a09a21a0 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (!kc->timer_create)
return -EOPNOTSUPP;
- new_timer = alloc_posix_timer();
- if (unlikely(!new_timer))
- return -EAGAIN;
-
- spin_lock_init(&new_timer->it_lock);
-
/* Special case for CRIU to restore timers with a given timer ID. */
if (unlikely(current->signal->timer_create_restore_ids)) {
if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
@@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EINVAL;
}
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
@@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
* sys_clock_settime(). The kernel internal timekeeping is always using
* nanoseconds precision independent of the clocksource device which is
* used to read the time from. The resolution of that device only
- * affects the presicion of the time returned by sys_clock_gettime().
+ * affects the precision of the time returned by sys_clock_gettime().
*
* Returns:
* 0 Success. @tp contains the resolution
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5e2c2c26b3cc..ffee943d796d 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -19,6 +19,10 @@
/**
* tick_program_event - program the CPU local timer device for the next event
+ * @expires: the time at which the next timer event should occur
+ * @force: flag to force reprogramming even if the event time hasn't changed
+ *
+ * Return: 0 on success, negative error code on failure
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -57,6 +61,13 @@ void tick_resume_oneshot(void)
/**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ * @newdev: Pointer to the clock event device to configure
+ * @handler: Function to be called when the event device triggers an interrupt
+ * @next_event: Initial expiry time for the next event (in ktime)
+ *
+ * Configures the specified clock event device for oneshot mode,
+ * assigns the given handler as its event callback, and programs
+ * the device to trigger at the specified next event time.
*/
void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
/**
* tick_switch_to_oneshot - switch to oneshot mode
+ * @handler: function to call when an event occurs on the tick device
+ *
+ * Return: 0 on success, -EINVAL if the tick device is not present,
+ * not functional, or does not support oneshot mode.
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
@@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
/**
* tick_oneshot_mode_active - check whether the system is in oneshot mode
*
- * returns 1 when either nohz or highres are enabled. otherwise 0.
+ * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/
int tick_oneshot_mode_active(void)
{
@@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void)
* tick_init_highres - switch to high resolution mode
*
* Called with interrupts disabled.
+ *
+ * Return: 0 on success, -EINVAL if the tick device cannot switch
+ * to oneshot/high-resolution mode.
*/
int tick_init_highres(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c527b421c865..8ddf74e705d3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
ts->flags &= ~flag;
}
+/*
+ * Allow only one non-timekeeper CPU at a time to update jiffies from
+ * the timer tick.
+ *
+ * Returns true if the update was run.
+ */
+static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
+{
+ static atomic_t in_progress;
+ int inp;
+
+ inp = atomic_read(&in_progress);
+ if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
+ return false;
+
+ if (ts->last_tick_jiffies == jiffies)
+ tick_do_update_jiffies64(now);
+ atomic_set(&in_progress, 0);
+ return true;
+}
+
#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
@@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_limited_update_jiffies64(ts, now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
@@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit >= 10)
- return false;
-
/* On RT, softirq handling may be waiting on some lock */
if (local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b6974fce800c..4790da895203 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = {
static int __init tk_aux_sysfs_init(void)
{
struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+ int ret = -ENOMEM;
if (!tko)
- return -ENOMEM;
+ return ret;
auxo = kobject_create_and_add("aux_clocks", tko);
- if (!auxo) {
- kobject_put(tko);
- return -ENOMEM;
- }
+ if (!auxo)
+ goto err_clean;
- for (int i = 0; i <= MAX_AUX_CLOCKS; i++) {
+ for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
char id[2] = { [0] = '0' + i, };
struct kobject *clk = kobject_create_and_add(id, auxo);
- if (!clk)
- return -ENOMEM;
-
- int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+ if (!clk) {
+ ret = -ENOMEM;
+ goto err_clean;
+ }
+ ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
if (ret)
- return ret;
+ goto err_clean;
}
return 0;
+
+err_clean:
+ kobject_put(auxo);
+ kobject_put(tko);
+ return ret;
}
late_initcall(tk_aux_sysfs_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 553fa469d7cc..1f2364126894 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer)
+ if (base->running_timer != timer) {
ret = detach_if_pending(timer, base, true);
- if (shutdown)
- timer->function = NULL;
+ if (shutdown)
+ timer->function = NULL;
+ }
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -2472,7 +2473,7 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
- if (in_irq())
+ if (in_hardirq())
irq_work_tick();
#endif
sched_tick();
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index c0c54dc5314c..18dda1aa782d 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/timerqueue.h>
#include <trace/events/ipi.h>
+#include <linux/sched/isolation.h>
#include "timer_migration.h"
#include "tick-internal.h"
@@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
+static struct tmigr_group *tmigr_root;
+
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
+/*
+ * CPUs available for timer migration.
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ * Additionally, tmigr_available_mutex serializes set/clear operations with each other.
+ */
+static cpumask_var_t tmigr_available_cpumask;
+static DEFINE_MUTEX(tmigr_available_mutex);
+
+/* Enabled during late initcall */
+static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
+
#define TMIGR_NONE 0xFF
#define BIT_CNT 8
static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
{
- return !(tmc->tmgroup && tmc->online);
+ return !(tmc->tmgroup && tmc->available);
+}
+
+/*
+ * Returns true if @cpu should be excluded from the hierarchy as isolated.
+ * Domain isolated CPUs don't participate in timer migration; nohz_full CPUs
+ * are still part of the hierarchy but become idle (from a tick and timer
+ * migration perspective) when they stop their tick. This lets the timekeeping
+ * CPU handle their global timers. Also marking isolated CPUs as idle would be
+ * too costly, hence they are completely excluded from the hierarchy.
+ * This check is necessary, for instance, to prevent offline isolated CPUs from
+ * being incorrectly marked as available when they come back online.
+ *
+ * This function returns false during early boot and the isolation logic is
+ * enabled only after isolated CPUs are marked as unavailable at late boot.
+ * The tick CPU can be isolated at boot; however, we cannot mark it as
+ * unavailable to avoid having no global migrator for the nohz_full CPUs. This
+ * should be ensured by the callers of this function: implicitly from hotplug
+ * callbacks and explicitly in tmigr_init_isolation() and
+ * tmigr_isolated_exclude_cpumask().
+ */
+static inline bool tmigr_is_isolated(int cpu)
+{
+ if (!static_branch_unlikely(&tmigr_exclude_isolated))
+ return false;
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
+ cpuset_cpu_is_isolated(cpu)) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
}
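A hedged case-by-case reading of the predicate above, once the static key is enabled (illustrative comment only):

	/*
	 *   isolcpus= / isolated cpuset, not nohz_full  -> true,  excluded
	 *   nohz_full CPU (not HK_TYPE_KERNEL_NOISE)    -> false, stays in the
	 *                                                  hierarchy and goes idle
	 *   ordinary housekeeping CPU                   -> false, stays available
	 */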
/*
@@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
* @now: timer base monotonic
* @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only
- * @tmc_active: this flag indicates, whether the CPU which triggers
- * the hierarchy walk is !idle in the timer migration
- * hierarchy. When the CPU is idle and the whole hierarchy is
- * idle, only the first event of the top level has to be
- * considered.
*/
struct tmigr_walk {
u64 nextexp;
@@ -517,16 +553,13 @@ struct tmigr_walk {
unsigned long basej;
u64 now;
bool check;
- bool tmc_active;
};
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
-static void __walk_groups(up_f up, struct tmigr_walk *data,
- struct tmigr_cpu *tmc)
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+ struct tmigr_group *child, struct tmigr_group *group)
{
- struct tmigr_group *child = NULL, *group = tmc->tmgroup;
-
do {
WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
@@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
} while (group);
}
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+ struct tmigr_cpu *tmc)
+{
+ __walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
{
lockdep_assert_held(&tmc->lock);
@@ -708,7 +747,7 @@ void tmigr_cpu_activate(void)
/*
* Returns true, if there is nothing to be propagated to the next level
*
- * @data->firstexp is set to expiry of first gobal event of the (top level of
+ * @data->firstexp is set to expiry of first global event of the (top level of
* the) hierarchy, but only when hierarchy is completely idle.
*
* The child and group states need to be read under the lock, to prevent a race
@@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* updated the event takes care when hierarchy is completely
* idle. Otherwise the migrator does it as the event is enqueued.
*/
- if (!tmc->online || tmc->remote || tmc->cpuevt.ignore ||
+ if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
now < tmc->cpuevt.nextevt.expires) {
raw_spin_unlock_irq(&tmc->lock);
return;
@@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* (See also section "Required event and timerqueue update after a
* remote expiry" in the documentation at the top)
*/
- if (!tmc->online || !tmc->idle) {
+ if (!tmc->available || !tmc->idle) {
timer_unlock_remote_bases(cpu);
goto unlock;
}
@@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
*/
if (!tmigr_check_migrator(group, childmask))
return true;
-
- /*
- * When there is a parent group and the CPU which triggered the
- * hierarchy walk is not active, proceed the walk to reach the top level
- * group before reading the next_expiry value.
- */
- if (group->parent && !data->tmc_active)
- return false;
-
/*
* The lock is required on 32bit architectures to read the variable
* consistently with a concurrent writer. On 64bit the lock is not
@@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void)
data.now = get_jiffies_update(&jif);
data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX;
- data.tmc_active = !tmc->idle;
data.check = false;
/*
@@ -1432,38 +1461,43 @@ static long tmigr_trigger_active(void *unused)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
- WARN_ON_ONCE(!tmc->online || tmc->idle);
+ WARN_ON_ONCE(!tmc->available || tmc->idle);
return 0;
}
-static int tmigr_cpu_offline(unsigned int cpu)
+static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator;
u64 firstexp;
- raw_spin_lock_irq(&tmc->lock);
- tmc->online = false;
- WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+ guard(mutex)(&tmigr_available_mutex);
- /*
- * CPU has to handle the local events on his own, when on the way to
- * offline; Therefore nextevt value is set to KTIME_MAX
- */
- firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
- trace_tmigr_cpu_offline(tmc);
- raw_spin_unlock_irq(&tmc->lock);
+ cpumask_clear_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (!tmc->available)
+ return 0;
+ tmc->available = false;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ /*
+ * CPU has to handle the local events on its own, when on the way to
+ * offline; therefore nextevt value is set to KTIME_MAX
+ */
+ firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
+ trace_tmigr_cpu_unavailable(tmc);
+ }
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any_but(cpu_online_mask, cpu);
+ migrator = cpumask_any(tmigr_available_cpumask);
work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0;
}
-static int tmigr_cpu_online(unsigned int cpu)
+static int tmigr_set_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1471,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
- raw_spin_lock_irq(&tmc->lock);
- trace_tmigr_cpu_online(tmc);
- tmc->idle = timer_base_is_idle();
- if (!tmc->idle)
- __tmigr_cpu_activate(tmc);
- tmc->online = true;
- raw_spin_unlock_irq(&tmc->lock);
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ guard(mutex)(&tmigr_available_mutex);
+
+ cpumask_set_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (tmc->available)
+ return 0;
+ trace_tmigr_cpu_available(tmc);
+ tmc->idle = timer_base_is_idle();
+ if (!tmc->idle)
+ __tmigr_cpu_activate(tmc);
+ tmc->available = true;
+ }
return 0;
}
+static void tmigr_cpu_isolate(struct work_struct *ignored)
+{
+ tmigr_clear_cpu_available(smp_processor_id());
+}
+
+static void tmigr_cpu_unisolate(struct work_struct *ignored)
+{
+ tmigr_set_cpu_available(smp_processor_id());
+}
+
+/**
+ * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
+ * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
+ *
+ * This function can be called from cpuset code to provide the new set of
+ * isolated CPUs that should be excluded from the hierarchy.
+ * Online CPUs not present in exclude_cpumask but already excluded are brought
+ * back to the hierarchy.
+ * The isolate/unisolate functions must run locally on the CPU and can sleep.
+ */
+int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+ struct work_struct __percpu *works __free(free_percpu) =
+ alloc_percpu(struct work_struct);
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!works)
+ return -ENOMEM;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * First set previously isolated CPUs as available (unisolate).
+ * This cpumask contains only CPUs that switched to available now.
+ */
+ cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
+ cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_unisolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /*
+ * Then clear previously available CPUs (isolate).
+ * This cpumask contains only CPUs that switched to not available now.
+ * There cannot be overlap with the newly available ones.
+ */
+ cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
+ cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ /*
+ * Handle this here and not in the cpuset code because exclude_cpumask
+ * might include also the tick CPU if included in isolcpus.
+ * might also include the tick CPU if included in isolcpus.
+ for_each_cpu(cpu, cpumask) {
+ if (!tick_nohz_cpu_hotpluggable(cpu)) {
+ cpumask_clear_cpu(cpu, cpumask);
+ break;
+ }
+ }
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_isolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ return 0;
+}
+
+static int __init tmigr_init_isolation(void)
+{
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+
+ static_branch_enable(&tmigr_exclude_isolated);
+
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN))
+ return 0;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ /* Protect against RCU torture hotplug testing */
+ guard(cpus_read_lock)();
+ return tmigr_isolated_exclude_cpumask(cpumask);
+}
+late_initcall(tmigr_init_isolation);
+
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node)
{
@@ -1498,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0;
atomic_set(&group->migr_state, s.state);
- /*
- * If this is a new top-level, prepare its groupmask in advance.
- * This avoids accidents where yet another new top-level is
- * created in the future and made visible before the current groupmask.
- */
- if (list_empty(&tmigr_level_list[lvl])) {
- group->groupmask = BIT(0);
- /*
- * The previous top level has prepared its groupmask already,
- * simply account it as the first child.
- */
- if (lvl > 0)
- group->num_children = 1;
- }
-
timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1520,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
- unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
@@ -1567,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
return group;
}
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+ if (!group->parent && group != tmigr_root) {
+ /*
+ * This is the new top-level, prepare its groupmask in advance
+ * to avoid accidents where yet another new top-level is
+ * created in the future and made visible before this groupmask.
+ */
+ group->groupmask = BIT(0);
+ WARN_ON_ONCE(activate);
+
+ return true;
+ }
+
+ return false;
+}
+
static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent,
bool activate)
{
- struct tmigr_walk data;
-
- raw_spin_lock_irq(&child->lock);
- raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
+ if (tmigr_init_root(parent, activate)) {
+ /*
+ * The previous top level had prepared its groupmask already,
+ * simply account it in advance as the first child. If some groups
+ * have been created between the old and new root due to node
+ * mismatch, the new root's child will be initialized accordingly.
+ */
+ parent->num_children = 1;
+ }
- if (activate) {
+ /* Connecting old root to new root ? */
+ if (!parent->parent && activate) {
/*
- * @child is the old top and @parent the new one. In this
- * case groupmask is pre-initialized and @child already
- * accounted, along with its new sibling corresponding to the
- * CPU going up.
+ * @child is the old top, or in case of node mismatch, some
+ * intermediate group between the old top and the new one in
+ * @parent. In this case the @child must be pre-accounted above
+ * as the first child. Its new inactive sibling corresponding
+ * to the CPU going up has been accounted as the second child.
*/
- WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+ WARN_ON_ONCE(parent->num_children != 2);
+ child->groupmask = BIT(0);
} else {
- /* Adding @child for the CPU going up to @parent. */
+ /* Common case: adding @child for the CPU going up to @parent. */
child->groupmask = BIT(parent->num_children++);
}
@@ -1596,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- raw_spin_unlock(&parent->lock);
- raw_spin_unlock_irq(&child->lock);
-
trace_tmigr_connect_child_parent(child);
-
- if (!activate)
- return;
-
- /*
- * To prevent inconsistent states, active children need to be active in
- * the new parent as well. Inactive children are already marked inactive
- * in the parent group:
- *
- * * When new groups were created by tmigr_setup_groups() starting from
- * the lowest level (and not higher then one level below the current
- * top level), then they are not active. They will be set active when
- * the new online CPU comes active.
- *
- * * But if a new group above the current top level is required, it is
- * mandatory to propagate the active state of the already existing
- * child to the new parent. So tmigr_connect_child_parent() is
- * executed with the formerly top level group (child) and the newly
- * created group (parent).
- *
- * * It is ensured that the child is active, as this setup path is
- * executed in hotplug prepare callback. This is exectued by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
- */
- data.childmask = child->groupmask;
-
- /*
- * There is only one new level per time (which is protected by
- * tmigr_mutex). When connecting the child and the parent and set the
- * child active when the parent is inactive, the parent needs to be the
- * uppermost level. Otherwise there went something wrong!
- */
- WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+ struct tmigr_group *start, bool activate)
{
struct tmigr_group *group, *child, **stack;
- int top = 0, err = 0, i = 0;
- struct list_head *lvllist;
+ int i, top = 0, err = 0, start_lvl = 0;
+ bool root_mismatch = false;
stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
if (!stack)
return -ENOMEM;
- do {
- group = tmigr_get_group(cpu, node, i);
+ if (start) {
+ stack[start->level] = start;
+ start_lvl = start->level + 1;
+ }
+
+ if (tmigr_root)
+ root_mismatch = tmigr_root->numa_node != node;
+
+ for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
+ group = tmigr_get_group(node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
+ i--;
break;
}
top = i;
- stack[i++] = group;
+ stack[i] = group;
/*
* When booting only less CPUs of a system than CPUs are
- * available, not all calculated hierarchy levels are required.
+ * available, not all calculated hierarchy levels are required,
+ * unless a node mismatch is detected.
*
* The loop is aborted as soon as the highest level, which might
* be different from tmigr_hierarchy_levels, contains only a
- * single group.
+ * single group, unless the nodes mismatch below tmigr_crossnode_level
*/
- if (group->parent || list_is_singular(&tmigr_level_list[i - 1]))
+ if (group->parent)
break;
+ if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+ list_is_singular(&tmigr_level_list[i]))
+ break;
+ }
- } while (i < tmigr_hierarchy_levels);
-
- /* Assert single root */
- WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]));
+ /* Assert single root without parent */
+ if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
+ return -EINVAL;
- while (i > 0) {
- group = stack[--i];
+ for (; i >= start_lvl; i--) {
+ group = stack[i];
if (err < 0) {
list_del(&group->list);
@@ -1692,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
if (i == 0) {
struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
- raw_spin_lock_irq(&group->lock);
-
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- raw_spin_unlock_irq(&group->lock);
+ tmigr_init_root(group, activate);
trace_tmigr_connect_cpu_parent(tmc);
@@ -1705,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
continue;
} else {
child = stack[i - 1];
- /* Will be activated at online time */
- tmigr_connect_child_parent(child, group, false);
+ tmigr_connect_child_parent(child, group, activate);
}
+ }
- /* check if uppermost level was newly created */
- if (top != i)
- continue;
-
- WARN_ON_ONCE(top == 0);
+ if (err < 0)
+ goto out;
- lvllist = &tmigr_level_list[top];
+ if (activate) {
+ struct tmigr_walk data;
+ union tmigr_state state;
/*
- * Newly created root level should have accounted the upcoming
- * CPU's child group and pre-accounted the old root.
+ * To prevent inconsistent states, active children need to be active in
+ * the new parent as well. Inactive children are already marked inactive
+ * in the parent group:
+ *
+ * * When new groups were created by tmigr_setup_groups() starting from
+ * the lowest level, then they are not active. They will be set active
+ * when the new online CPU comes active.
+ *
+ * * But if new groups above the current top level are required, it is
+ * mandatory to propagate the active state of the already existing
+ * child to the new parents. So tmigr_active_up() activates the
+ * new parents while walking up from the old root to the new.
+ *
+ * * It is ensured that @start is active, as this setup path is
+ * executed in hotplug prepare callback. This is executed by an
+ * already connected and !idle CPU. Even if all other CPUs go idle,
+ * the CPU executing the setup will be responsible up to current top
+ * level group. And the next time it goes inactive, it will release
+ * the new childmask and parent to subsequent walkers through this
+ * @child. Therefore propagate active state unconditionally.
*/
- if (group->num_children == 2 && list_is_singular(lvllist)) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
-
- lvllist = &tmigr_level_list[top - 1];
- list_for_each_entry(child, lvllist, list) {
- if (child->parent)
- continue;
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(!state.active);
+ WARN_ON_ONCE(!start->parent);
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
- tmigr_connect_child_parent(child, group, true);
- }
+ /* Root update */
+ if (list_is_singular(&tmigr_level_list[top])) {
+ group = list_first_entry(&tmigr_level_list[top],
+ typeof(*group), list);
+ WARN_ON_ONCE(group->parent);
+ if (tmigr_root) {
+ /* Old root should be the same or below */
+ WARN_ON_ONCE(tmigr_root->level > top);
}
+ tmigr_root = group;
}
-
+out:
kfree(stack);
return err;
@@ -1748,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
static int tmigr_add_cpu(unsigned int cpu)
{
+ struct tmigr_group *old_root = tmigr_root;
int node = cpu_to_node(cpu);
int ret;
- mutex_lock(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node);
- mutex_unlock(&tmigr_mutex);
+ guard(mutex)(&tmigr_mutex);
+
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+ /* Root has changed? Connect the old one to the new */
+ if (ret >= 0 && old_root && old_root != tmigr_root) {
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (regardless of whether the old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ /*
+ * The (likely) current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
+ ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ }
return ret;
}
@@ -1798,6 +1956,11 @@ static int __init tmigr_init(void)
if (ncpus == 1)
return 0;
+ if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
/*
* Calculate the required hierarchy levels. Unfortunately there is no
* reliable information available, unless all possible CPUs have been
@@ -1847,7 +2010,7 @@ static int __init tmigr_init(void)
goto err;
ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
- tmigr_cpu_online, tmigr_cpu_offline);
+ tmigr_set_cpu_available, tmigr_clear_cpu_available);
if (ret)
goto err;
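
A minimal sketch of the per-CPU work pattern that tmigr_isolated_exclude_cpumask() relies on above: a handler that must run locally on each target CPU and may sleep is dispatched with schedule_work_on() and then flushed. The names apply_on_cpus() and my_percpu_fn() are hypothetical and not part of the patch.

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/* Hypothetical handler; runs on the CPU it was scheduled on and may sleep. */
static void my_percpu_fn(struct work_struct *work)
{
	pr_info("applied on CPU %d\n", raw_smp_processor_id());
}

/* Run my_percpu_fn() locally on every CPU in @mask and wait for completion. */
static int apply_on_cpus(const struct cpumask *mask)
{
	struct work_struct __percpu *works = alloc_percpu(struct work_struct);
	int cpu;

	if (!works)
		return -ENOMEM;

	for_each_cpu(cpu, mask) {
		struct work_struct *w = per_cpu_ptr(works, cpu);

		INIT_WORK(w, my_percpu_fn);
		schedule_work_on(cpu, w);
	}
	for_each_cpu(cpu, mask)
		flush_work(per_cpu_ptr(works, cpu));

	free_percpu(works);
	return 0;
}
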
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index ae19f70f8170..70879cde6fdd 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -97,7 +97,7 @@ struct tmigr_group {
*/
struct tmigr_cpu {
raw_spinlock_t lock;
- bool online;
+ bool available;
bool idle;
bool remote;
struct tmigr_group *tmgroup;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 42bd2ba68a82..59cfacb8a5bb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
- struct ftrace_hash *new_hash)
+ struct ftrace_hash *new_hash,
+ bool update_target)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
@@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (rec->flags & FTRACE_FL_DISABLED)
continue;
- /* We need to update only differences of filter_hash */
+ /*
+ * Unless we are updating the target of a direct function,
+ * we only need to update differences of filter_hash
+ */
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
- if (in_old == in_new)
+ if (!update_target && (in_old == in_new))
continue;
if (in_new) {
@@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (is_ipmodify)
goto rollback;
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+ /*
+ * If this is called by __modify_ftrace_direct()
+ * then it is only changing where the direct
+ * pointer is jumping to, and the record already
+ * points to a direct trampoline. If it isn't,
+ * then it is a bug to update ipmodify on a direct
+ * caller.
+ */
+ FTRACE_WARN_ON(!update_target &&
+ (rec->flags & FTRACE_FL_DIRECT));
/*
* Another ops with IPMODIFY is already
@@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false);
}
/* Disabling always succeeds */
@@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false);
}
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
@@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
if (ftrace_hash_empty(new_hash))
new_hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false);
}
static void print_ip_ins(const char *fmt, const unsigned char *p)
@@ -5953,6 +5966,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp)
free_ftrace_hash(fhp);
}
+static void reset_direct(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ remove_direct_functions_hash(hash, addr);
+
+ /* Clean up for a possible subsequent register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -6048,6 +6072,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
+ if (err)
+ reset_direct(ops, addr);
out_unlock:
mutex_unlock(&direct_mutex);
@@ -6080,7 +6106,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
bool free_filters)
{
- struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
if (check_direct_multi(ops))
@@ -6090,13 +6115,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
mutex_lock(&direct_mutex);
err = unregister_ftrace_function(ops);
- remove_direct_functions_hash(hash, addr);
+ reset_direct(ops, addr);
mutex_unlock(&direct_mutex);
- /* cleanup for possible another register call */
- ops->func = NULL;
- ops->trampoline = 0;
-
if (free_filters)
ftrace_free_filter(ops);
return err;
@@ -6106,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash;
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
struct ftrace_func_entry *entry, *iter;
static struct ftrace_ops tmp_ops = {
.func = ftrace_stub,
@@ -6127,12 +6148,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
/*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, hash, hash, true);
+ if (err)
+ goto out;
+
+ /*
* Now the ftrace_ops_list_func() is called to do the direct callers.
* We can safely change the direct functions attached to each entry.
*/
mutex_lock(&ftrace_lock);
- hash = ops->func_hash->filter_hash;
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
@@ -6147,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
mutex_unlock(&ftrace_lock);
+out:
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
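
For context, a sketch of the register/modify/unregister sequence that the reset_direct() change above makes more robust: a failed register_ftrace_direct() now leaves the ops reusable. my_ops, my_tramp and my_tramp2 are hypothetical; real trampolines are architecture-specific assembly, and wake_up_process() is only an example target.

#include <linux/ftrace.h>
#include <linux/sched.h>

static struct ftrace_ops my_ops;

/* Hypothetical trampolines, normally implemented in arch assembly. */
extern void my_tramp(void);
extern void my_tramp2(void);

static int my_attach(void)
{
	int ret;

	/* Select the traced function, then attach the direct trampoline. */
	ret = ftrace_set_filter_ip(&my_ops, (unsigned long)wake_up_process, 0, 0);
	if (ret)
		return ret;

	/* On failure the ops is now reset and can be registered again later. */
	return register_ftrace_direct(&my_ops, (unsigned long)my_tramp);
}

static int my_retarget(void)
{
	/* Switch all attached entries to the new trampoline. */
	return modify_ftrace_direct(&my_ops, (unsigned long)my_tramp2);
}

static void my_detach(void)
{
	unregister_ftrace_direct(&my_ops, (unsigned long)my_tramp2, true);
}
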
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1244d2c5c384..afcd3747264d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7344,6 +7344,10 @@ consume:
goto out;
}
+ /* Did the reader catch up with the writer? */
+ if (cpu_buffer->reader_page == cpu_buffer->commit_page)
+ goto out;
+
reader = rb_get_reader_page(cpu_buffer);
if (WARN_ON(!reader))
goto out;
diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig
index 5e16625f1653..0e013f00c33b 100644
--- a/kernel/trace/rv/monitors/pagefault/Kconfig
+++ b/kernel/trace/rv/monitors/pagefault/Kconfig
@@ -5,6 +5,7 @@ config RV_MON_PAGEFAULT
select RV_LTL_MONITOR
depends on RV_MON_RTAPP
depends on X86 || RISCV
+ depends on MMU
default y
select LTL_MON_EVENTS_ID
bool "pagefault monitor"
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 48338520376f..43e9ea473cda 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -501,7 +501,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
list_for_each_entry_continue(mon, &rv_monitors_list, list) {
if (mon->enabled)
- return mon;
+ return &mon->list;
}
return NULL;
@@ -509,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
{
- struct rv_monitor *mon;
+ struct list_head *head;
loff_t l;
mutex_lock(&rv_interface_lock);
@@ -517,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
if (list_empty(&rv_monitors_list))
return NULL;
- mon = list_entry(&rv_monitors_list, struct rv_monitor, list);
+ head = &rv_monitors_list;
for (l = 0; l <= *pos; ) {
- mon = enabled_monitors_next(m, mon, &l);
- if (!mon)
+ head = enabled_monitors_next(m, head, &l);
+ if (!head)
break;
}
- return mon;
+ return head;
}
/*
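
A small illustration of why the iterator above now passes list_head pointers: the seq_file callbacks agree on the iterator type, and a ->show handler recovers the monitor with container_of(). This is a hypothetical sketch with an assumed struct layout (matching the fields used in the hunk), not the exact rv.c implementation.

#include <linux/list.h>
#include <linux/seq_file.h>

/* Assumed shape of the monitor, mirroring the fields used above. */
struct rv_monitor_sketch {
	const char *name;
	struct list_head list;
	bool enabled;
};

static int enabled_monitors_show_sketch(struct seq_file *m, void *p)
{
	/* p is the &mon->list pointer returned by _start()/_next() */
	struct rv_monitor_sketch *mon =
		container_of((struct list_head *)p, struct rv_monitor_sketch, list);

	seq_printf(m, "%s\n", mon->name);
	return 0;
}
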
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d1e527cf2aae..304e93597126 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
put_snapshot_map(iter->tr);
}
+static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Trace buffer mappings require the complete buffer including
+ * the meta page. Partial mappings are not supported.
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct tracing_buffers_vmops = {
.close = tracing_buffers_mmap_close,
+ .may_split = tracing_buffers_may_split,
};
static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
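
From the mapper's point of view, the new .may_split handler means any operation that would split the VMA, such as unmapping only part of the buffer, is refused. A user-space sketch under the assumption that the buffer is mapped from trace_pipe_raw; the path and length are illustrative only (the real length comes from the meta page).

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
	size_t len = 16 * 4096;
	void *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Unmapping only part of the buffer would split the VMA: now fails with EINVAL. */
	if (munmap(p, len / 2) < 0)
		perror("partial munmap");

	munmap(p, len);
	close(fd);
	return 0;
}
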
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1d536219b624..6bfaf1210dd2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3272,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
var = create_var(hist_data, file, field_name, val->size, val->type);
if (IS_ERR(var)) {
hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));
- kfree(val);
+ destroy_hist_field(val, 0);
ret = PTR_ERR(var);
goto err;
}
field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
if (!field_var) {
- kfree(val);
+ destroy_hist_field(val, 0);
+ kfree_const(var->type);
+ kfree(var->var.name);
kfree(var);
ret = -ENOMEM;
goto err;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index c428dafe7496..b15854c75d4f 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = {
static int user_event_set_call_visible(struct user_event *user, bool visible)
{
- int ret;
- const struct cred *old_cred;
- struct cred *cred;
-
- cred = prepare_creds();
-
+ CLASS(prepare_creds, cred)();
if (!cred)
return -ENOMEM;
@@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible)
*/
cred->fsuid = GLOBAL_ROOT_UID;
- old_cred = override_creds(cred);
-
- if (visible)
- ret = trace_add_event_call(&user->call);
- else
- ret = trace_remove_event_call(&user->call);
-
- revert_creds(old_cred);
- put_cred(cred);
+ scoped_with_creds(cred) {
+ if (visible)
+ return trace_add_event_call(&user->call);
- return ret;
+ return trace_remove_event_call(&user->call);
+ }
}
static int destroy_user_event(struct user_event *user)
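
For comparison, the open-coded credential override that the scoped_with_creds() conversion above replaces, as it looked before the patch; do_privileged_work() is a hypothetical stand-in for the trace_add/remove_event_call() calls.

#include <linux/cred.h>

/* Hypothetical stand-in for the work done with elevated fsuid. */
extern int do_privileged_work(void);

static int run_with_root_fsuid(void)
{
	const struct cred *old_cred;
	struct cred *cred;
	int ret;

	cred = prepare_creds();
	if (!cred)
		return -ENOMEM;

	/* Raise fsuid only for the duration of the call. */
	cred->fsuid = GLOBAL_ROOT_UID;
	old_cred = override_creds(cred);

	ret = do_privileged_work();

	revert_creds(old_cred);
	put_cred(cred);
	return ret;
}
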
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index ad9d6347b5fa..8001dbf16891 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -106,13 +106,14 @@ static struct tracepoint_user *__tracepoint_user_init(const char *name, struct t
if (!tuser->name)
return NULL;
+ /* Register tracepoint if it is loaded. */
if (tpoint) {
+ tuser->tpoint = tpoint;
ret = tracepoint_user_register(tuser);
if (ret)
return ERR_PTR(ret);
}
- tuser->tpoint = tpoint;
tuser->refcount = 1;
INIT_LIST_HEAD(&tuser->list);
list_add(&tuser->list, &tracepoint_user_list);
@@ -1513,6 +1514,10 @@ static int disable_trace_fprobe(struct trace_event_call *call,
if (!trace_probe_is_enabled(tp)) {
list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
unregister_fprobe(&tf->fp);
+ if (tf->tuser) {
+ tracepoint_user_put(tf->tuser);
+ tf->tuser = NULL;
+ }
}
}
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index dc6040aae3ee..a88fb481c4a3 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu);
static inline bool unwind_pending(struct unwind_task_info *info)
{
- return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+ return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
}
/*
@@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info)
{
u32 cnt = 1;
+ lockdep_assert_irqs_disabled();
+
if (info->id.cpu)
return info->id.id;
@@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
cache = info->cache;
trace->entries = cache->entries;
-
- if (cache->nr_entries) {
- /*
- * The user stack has already been previously unwound in this
- * entry context. Skip the unwind and use the cache.
- */
- trace->nr = cache->nr_entries;
+ trace->nr = cache->nr_entries;
+ /*
+ * The user stack has already been unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ if (trace->nr)
return 0;
- }
- trace->nr = 0;
unwind_user(trace, UNWIND_MAX_ENTRIES);
cache->nr_entries = trace->nr;
/* Clear nr_entries on way back to user space */
- set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+ atomic_long_or(UNWIND_USED, &info->unwind_mask);
return 0;
}
@@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task)
/* Clear pending bit but make sure to have the current bits */
bits = atomic_long_fetch_andnot(UNWIND_PENDING,
- (atomic_long_t *)&info->unwind_mask);
+ &info->unwind_mask);
/*
* From here on out, the callback must always be called, even if it's
* just an empty trace.
@@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task)
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
struct unwind_task_info *info = &current->unwind_info;
+ int twa_mode = TWA_RESUME;
unsigned long old, bits;
unsigned long bit;
int ret;
@@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* Trigger a warning to make it obvious that an architecture
* is using this in NMI when it should not be.
*/
- if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
- return -EINVAL;
+ if (in_nmi()) {
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
+ return -EINVAL;
+ twa_mode = TWA_NMI_CURRENT;
+ }
/* Do not allow cancelled works to request again */
bit = READ_ONCE(work->bit);
@@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
*cookie = get_cookie(info);
- old = READ_ONCE(info->unwind_mask);
+ old = atomic_long_read(&info->unwind_mask);
/* Is this already queued or executed */
if (old & bit)
@@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* to have a callback.
*/
bits = UNWIND_PENDING | bit;
- old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ old = atomic_long_fetch_or(bits, &info->unwind_mask);
if (old & bits) {
/*
* If the work's bit was set, whatever set it had better
@@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
}
/* The work has been claimed, now schedule it. */
- ret = task_work_add(current, &info->work, TWA_RESUME);
+ ret = task_work_add(current, &info->work, twa_mode);
if (WARN_ON_ONCE(ret))
- WRITE_ONCE(info->unwind_mask, 0);
+ atomic_long_set(&info->unwind_mask, 0);
return ret;
}
@@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work)
guard(rcu)();
/* Clear this bit from all threads */
for_each_process_thread(g, t) {
- clear_bit(bit, &t->unwind_info.unwind_mask);
+ atomic_long_andnot(BIT(bit),
+ &t->unwind_info.unwind_mask);
if (t->unwind_info.cache)
clear_bit(bit, &t->unwind_info.cache->unwind_completed);
}
@@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task)
memset(info, 0, sizeof(*info));
init_task_work(&info->work, unwind_deferred_task_work);
- info->unwind_mask = 0;
+ atomic_long_set(&info->unwind_mask, 0);
}
void unwind_task_free(struct task_struct *task)
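
A reduced sketch of the claim protocol that the atomic_long conversion above preserves: PENDING and the per-work bit are set in one atomic fetch_or, and only the caller that observed both bits clear goes on to queue the task work. Names are hypothetical and the error handling of the real code is omitted.

#include <linux/atomic.h>
#include <linux/bits.h>

#define MY_PENDING_BIT	31
#define MY_PENDING	BIT(MY_PENDING_BIT)

static atomic_long_t my_mask;

/* Returns true if the caller claimed the right to queue the deferred work. */
static bool my_claim(unsigned int work_bit)
{
	unsigned long bits = MY_PENDING | BIT(work_bit);
	unsigned long old = atomic_long_fetch_or(bits, &my_mask);

	/* Work already queued, or a pending task work will pick up our bit. */
	if (old & bits)
		return false;

	return true;
}
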
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 97a8415e3216..39e270789444 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,18 +8,28 @@
#include <linux/unwind_user.h>
#include <linux/uaccess.h>
-static const struct unwind_user_frame fp_frame = {
- ARCH_INIT_USER_FP_FRAME
-};
-
#define for_each_user_frame(state) \
for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
-static int unwind_user_next_fp(struct unwind_user_state *state)
+static inline int
+get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
+{
+ unsigned long __user *addr = (void __user *)base + off;
+#ifdef CONFIG_COMPAT
+ if (ws == sizeof(int)) {
+ unsigned int data;
+ int ret = get_user(data, (unsigned int __user *)addr);
+ *word = data;
+ return ret;
+ }
+#endif
+ return get_user(*word, addr);
+}
+
+static int unwind_user_next_common(struct unwind_user_state *state,
+ const struct unwind_user_frame *frame)
{
- const struct unwind_user_frame *frame = &fp_frame;
unsigned long cfa, fp, ra;
- unsigned int shift;
if (frame->use_fp) {
if (state->fp < state->sp)
@@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
return -EINVAL;
/* Make sure that the address is word aligned */
- shift = sizeof(long) == 4 ? 2 : 3;
- if (cfa & ((1 << shift) - 1))
+ if (cfa & (state->ws - 1))
return -EINVAL;
/* Find the Return Address (RA) */
- if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+ if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
return -EINVAL;
- if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
return -EINVAL;
state->ip = ra;
state->sp = cfa;
if (frame->fp_off)
state->fp = fp;
+ state->topmost = false;
return 0;
}
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+ struct pt_regs *regs = task_pt_regs(current);
+
+ if (state->topmost && unwind_user_at_function_start(regs)) {
+ const struct unwind_user_frame fp_entry_frame = {
+ ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_entry_frame);
+ }
+
+ const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_frame);
+#else
+ return -EINVAL;
+#endif
+}
+
static int unwind_user_next(struct unwind_user_state *state)
{
unsigned long iter_mask = state->available_types;
@@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state)
state->ip = instruction_pointer(regs);
state->sp = user_stack_pointer(regs);
state->fp = frame_pointer(regs);
+ state->ws = unwind_user_word_size(regs);
+ if (!state->ws) {
+ state->done = true;
+ return -EINVAL;
+ }
+ state->topmost = true;
return 0;
}
diff --git a/kernel/user.c b/kernel/user.c
index 0163665914c9..7aef4e679a6a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
* and 1 for... ?
*/
struct user_namespace init_user_ns = {
+ .ns = NS_COMMON_INIT(init_user_ns),
.uid_map = {
{
.extent[0] = {
@@ -65,14 +66,8 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.ns_type = ns_common_type(&init_user_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = ns_init_inum(&init_user_ns),
-#ifdef CONFIG_USER_NS
- .ns.ops = &userns_operations,
-#endif
.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_KEYS
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 7e45559521af..52f89f1137da 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue,
offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
get_page(page);
len = n->info & WATCH_INFO_LENGTH;
- p = kmap_atomic(page);
+ p = kmap_local_page(page);
memcpy(p + offset, n, len);
- kunmap_atomic(p);
+ kunmap_local(p);
buf = pipe_buf(pipe, head);
buf->page = page;
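
Lastly, the kmap_local_page() idiom used in the hunk above, shown standalone: the mapping is strictly CPU-local and, unlike kmap_atomic(), does not disable preemption or pagefaults, so a plain memcpy() in task context is fine. The helper name is hypothetical.

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy @len bytes into @page at @offset via a short-lived local mapping. */
static void copy_into_page(struct page *page, size_t offset,
			   const void *src, size_t len)
{
	void *p = kmap_local_page(page);

	memcpy(p + offset, src, len);
	kunmap_local(p);
}
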